In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp
from datetime import datetime
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
# from src.model import *
# from src.util import *
os.environ["CUDA_VISIBLE_DEVICES"] = '1'

In [2]:
trained_model_path = '/workdir/security/home/junjiehuang2468/paper/trained_models_weight/kaggle_miscrosoft/'
best_trained_model = '2022-01-18 19:10/50w_epoch:0_test_acc:0.905704.pt'
data_path = "/workdir/security/home/junjiehuang2468/paper/data/kaggle/"
train_data_path = data_path + "malwares/"  # Training data
train_label_path = data_path + "train_labels.csv"  # Training label

In [3]:
result_path = '/workdir/security/home/junjiehuang2468/paper/results/kaggle_miscrosoft/'

In [4]:
CUDA = True if torch.cuda.is_available() else False
NUM_WORKERS = 16  # Number of cores to use for data loader
BATCH_SIZE = 3  #
LEAVE_BIT_NUMBER = 500000
KERNEL_SIZE = 500  # Kernel size & stride for Malconv (defualt : 500)

In [5]:
trainset = pd.read_csv(data_path + 'train_dataset.csv')
validset = pd.read_csv(data_path + 'valid_dataset.csv')

In [6]:
class ExeDataset(Dataset):
    def __init__(self, malware_names, data_path, labels, leave_bit_num):
        self.malware_names = malware_names
        self.data_path = data_path
        self.labels = labels
        self.leave_bit_num = leave_bit_num

    def __len__(self):
        return len(self.malware_names)

    def __getitem__(self, idx):
        with open(self.data_path + self.malware_names[idx] + '.txt','rb') as fp:
            data = [bit+1 for bit in fp.read()[:self.leave_bit_num]]
            padding = [0]*(self.leave_bit_num-len(data))
            data = data + padding

        return np.array(data), np.array([self.labels[idx]])

In [7]:
train_dataset = ExeDataset(
    trainset["id"].tolist(), 
    train_data_path, 
    trainset["labels"].tolist(), 
    LEAVE_BIT_NUMBER
)
valid_dataset = ExeDataset(
    validset["id"].tolist(), 
    train_data_path, 
    validset["labels"].tolist(), 
    LEAVE_BIT_NUMBER
)

In [8]:
trainloader = DataLoader(
    dataset = train_dataset,
    batch_size = BATCH_SIZE,
    shuffle = True,
    num_workers = NUM_WORKERS,
    pin_memory = True
)
validloader = DataLoader(
    dataset = valid_dataset,
    batch_size = BATCH_SIZE,
    shuffle = True,
    num_workers = NUM_WORKERS,
    pin_memory = True
)

In [9]:
def mp_func(i,inpu,te,gr):
    check = 0
    grad_cum = 0
    for j,(inp,g,t) in enumerate(zip(inpu,gr,te)):
        if inp != 0: 
            check = j
            continue
        max_idx = np.argmin(g).tolist()
        org_max_idx = np.argmax(t).tolist()
        if g[max_idx] > 0: continue
        grad_cum += g[max_idx]
        te[j][org_max_idx] = 0
        te[j][max_idx] = 1
    return [i,te,check,grad_cum]

In [10]:
# class Model(nn.Module):
#     def __init__(self, data_length = 2e6, kernel_size = 500):
#         super().__init__()
#         self.embedding = nn.Embedding(257, 8, padding_idx=0)
#         self.conv_layer_1 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
#         # self.bn_1 = nn.BatchNorm1d(128)
#         self.conv_layer_2 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
#         self.pool_layer_2 = nn.MaxPool1d(data_length//kernel_size)
#         self.fc_layer_3 = nn.Linear(128, 128)
#         self.fc_layer_4 = nn.Linear(128, 9)
        
#     def forward(self,x):
#         x = self.embedding(x)
#         x = x.transpose(-1,-2)
#         x_conv_1 = self.conv_layer_1(x[:,:4,:])
#         x_conv_2 = torch.sigmoid(self.conv_layer_2(x[:,4:,:]))
#         x = x_conv_1*x_conv_2
#         del x_conv_1,x_conv_2
#         x = self.pool_layer_2(x).squeeze()
#         x = self.fc_layer_3(x)
#         x = self.fc_layer_4(x)
#         # x = torch.sigmoid(x)
#         return x

In [11]:
class Model(nn.Module):
    def __init__(self, data_length = 2e6, kernel_size = 500):
        super().__init__()
        self.embedding = nn.Embedding(257, 8, padding_idx=0)
        self.conv_layer_1 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
        # self.bn_1 = nn.BatchNorm1d(128)
        self.conv_layer_2 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
        self.pool_layer_2 = nn.MaxPool1d(data_length//kernel_size)
        self.fc_layer_3 = nn.Linear(128, 128)
        self.fc_layer_4 = nn.Linear(128, 9)
    
    def forward(self, input_, loss_fn, fake_label, label):
        temp = F.one_hot(input_,num_classes=257).float()
        temp.requires_grad = True
        temp.retain_grad()
        for _ in range(10):
            x = temp @ self.embedding.weight
            x = x.transpose(-1,-2)
            x_conv_1 = self.conv_layer_1(x[:,:4,:])
            x_conv_2 = torch.sigmoid(self.conv_layer_2(x[:,4:,:]))
            x = x_conv_1*x_conv_2
            del x_conv_1,x_conv_2
            x = self.pool_layer_2(x).squeeze()
            x = self.fc_layer_3(x)
            x = self.fc_layer_4(x)
            # fake_label = torch.zeros_like(x)
            print((torch.argmax(torch.softmax(x,dim=-1),dim=-1).tolist() , label.tolist()))
            loss = loss_fn(x,fake_label).cuda()
            print(loss)
            loss.backward()
            data = [(i,inpu,te,gr) for i,(inpu,te,gr) in enumerate(zip(
                input_.detach().cpu().numpy(),
                temp.detach().cpu().numpy(),
                temp.grad.detach().cpu().numpy()
            ))]
            with mp.Pool(processes=24 if len(data) > 24 else len(data)) as pool:
                results = pool.starmap(mp_func,data)
            
            check = [r[2] for r in results]
            results = sorted(results,key = lambda x: x[0])
            print([(r[2],temp.grad[i,r[2]:,:].sum(-1).sum(-1).tolist()) for i,r in enumerate(results)])
            if sum(check)/len(check) == LEAVE_BIT_NUMBER-1: break
            for i in range(len(temp)):
                temp.data[i] = torch.tensor(results[i][1], dtype=torch.float, requires_grad=True).cuda()
                
        return x.cpu().detach().numpy(),temp,check

In [12]:
# class Model(nn.Module):
#     def __init__(self, data_length = 2e6, kernel_size = 500):
#         super().__init__()
#         self.embedding = nn.Embedding(257, 8, padding_idx=0)
#         self.conv_layer_1 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
#         # self.bn_1 = nn.BatchNorm1d(128)
#         self.conv_layer_2 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
#         self.pool_layer_2 = nn.MaxPool1d(data_length//kernel_size)
#         self.fc_layer_3 = nn.Linear(128, 128)
#         self.fc_layer_4 = nn.Linear(128, 9)
    
#     def forward(self, input_, loss_fn):
#         temp = F.one_hot(input_,num_classes=257).float()
#         temp.requires_grad = True
#         temp.retain_grad()
#         for _ in range(6):
#             x = temp @ self.embedding.weight
#             x = torch.transpose(x, -1, -2)

#             x_conv_1 = self.conv_layer_1(x[:,:4,:])
#             x_conv_2 = torch.sigmoid(self.conv_layer_2(x[:,4:,:]))

#             x = x_conv_1*x_conv_2
#             del x_conv_1,x_conv_2
#             x = self.pool_layer_2(x).squeeze()
            
#             x = self.fc_layer_3(x)
#             x = self.fc_layer_4(x)
            
#             print(torch.argmax(torch.softmax(x,dim=-1),dim=-1).float().mean())
            
#             mislead_labels = torch.zeros_like(x).cuda()
#             loss = loss_fn(x,mislead_labels)
#             loss.backward()
            
#             data = [(i,inpu,te,gr) for i,(inpu,te,gr) in enumerate(zip(
#                 input_.detach().cpu().numpy(),
#                 temp.detach().cpu().numpy(),
#                 temp.grad.detach().cpu().numpy()
#             ))]
#             with mp.Pool(processes=24 if len(data) > 24 else len(data)) as pool:
#                 results = pool.starmap(mp_func,data)
#             results = sorted(results,key = lambda x: x[0])
#             for i in range(len(temp)):
#                 temp.data[i] = torch.tensor(results[i][1], dtype=torch.float, requires_grad=True).cuda()
        
#         return x.detach().cpu().numpy(),temp

In [13]:
model = Model(data_length=LEAVE_BIT_NUMBER,kernel_size=KERNEL_SIZE)

ce_loss = nn.CrossEntropyLoss()
optim = Adam(model.parameters())

model = model.cuda() if CUDA else model
ce_loss = ce_loss.cuda() if CUDA else ce_less

In [14]:
model.load_state_dict(torch.load(trained_model_path + best_trained_model))

<All keys matched successfully>

In [15]:
if __name__=='__main__':
    model.eval()
    acc = []
    preds = []
    labels = []
    for step, (batch_data,batch_label) in enumerate(tqdm(validloader)):
        batch_data = batch_data.cuda() if CUDA else batch_data
        batch_label = batch_label.cuda() if CUDA else batch_label
        batch_label = batch_label.squeeze() - 1
        fake_label = torch.zeros_like(batch_label)
        pred,temp,check = model(batch_data,ce_loss,fake_label,batch_label)
        if sum(check)/len(check) == LEAVE_BIT_NUMBER-1: continue
            
        pred = np.argmax(pred,1)
        batch_label = batch_label.cpu().data.numpy()

        preds.extend(pred.tolist())
        labels.extend(batch_label.tolist())

        count = total = 0
        for c,ans in zip(check,batch_label == pred):
            if c != LEAVE_BIT_NUMBER-1:
                count += ans
                total += 1
        acc.append(count/total)
        print(f"test：{count/total}, test mean: {np.mean(acc)}")

  0%|          | 0/725 [00:00<?, ?it/s]

([2, 2, 2], [2, 2, 2])
tensor(13.0813, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  0%|          | 1/725 [00:26<5:20:14, 26.54s/it]

([1, 0, 5], [1, 0, 5])
tensor(4.4735, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 0.12569290399551392), (499999, 0.0)]
([1, 7, 5], [1, 0, 5])
tensor(4.8047, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 3.6421713829040527), (499999, 0.0)]
([1, 0, 5], [1, 0, 5])
tensor(4.4591, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 4.033941268920898), (499999, 0.0)]
([1, 7, 5], [1, 0, 5])
tensor(5.2205, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 1.241539716720581), (499999, 0.0)]
([1, 0, 5], [1, 0, 5])
tensor(4.4260, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 1.343736171722412), (499999, 0.0)]
([1, 0, 5], [1, 0, 5])
tensor(4.4254, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 1.4270483255386353), (499999, 0.0)]
([1, 0, 5], [1, 0, 5])
tensor(4.4251, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 1.5045229196548462), (499999, 

  0%|          | 2/725 [05:05<35:08:14, 174.96s/it]

test：1.0, test mean: 1.0
([2, 1, 8], [2, 1, 8])
tensor(7.3896, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  0%|          | 3/725 [05:30<21:19:41, 106.35s/it]

([2, 7, 8], [2, 7, 0])
tensor(7.4023, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21019810438156128), (499999, 0.0)]
([2, 7, 8], [2, 7, 0])
tensor(7.4906, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.092628479003906), (499999, 0.0)]
([2, 7, 8], [2, 7, 0])
tensor(5.9455, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.873405933380127), (499999, 0.0)]
([2, 0, 8], [2, 7, 0])
tensor(4.4140, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.659365653991699), (499999, 0.0)]
([2, 0, 8], [2, 7, 0])
tensor(4.3744, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.063767433166504), (499999, 0.0)]
([2, 0, 8], [2, 7, 0])
tensor(4.3609, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.2313127517700195), (499999, 0.0)]
([2, 0, 8], [2, 7, 0])
tensor(4.3561, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.381188869476318), (4999

  1%|          | 4/725 [10:08<34:54:35, 174.31s/it]

test：0.0, test mean: 0.5
([2, 0, 1], [2, 1, 1])
tensor(6.6998, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  1%|          | 5/725 [10:33<24:03:37, 120.30s/it]

([3, 1, 1], [3, 1, 1])
tensor(3.8742, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -0.5786563158035278), (499999, 0.0), (499999, 0.0)]
([3, 1, 1], [3, 1, 1])
tensor(3.8774, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -2.8540782928466797), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [3, 1, 1])
tensor(3.1903, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -3.13746976852417), (499999, 0.0), (499999, 0.0)]
([1, 1, 1], [3, 1, 1])
tensor(3.0741, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -0.1527186930179596), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [3, 1, 1])
tensor(2.7938, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 0.7315129041671753), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [3, 1, 1])
tensor(2.7671, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 1.117597222328186), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [3, 1, 1])
tensor(2.7545, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 1.246053695678711), (499999, 0.0), (499999

  1%|          | 6/725 [15:08<34:33:04, 173.00s/it]

test：0.0, test mean: 0.3333333333333333
([2, 3, 8], [2, 3, 8])
tensor(7.3371, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  1%|          | 7/725 [15:33<24:52:23, 124.71s/it]

([1, 7, 5], [1, 7, 3])
tensor(7.1047, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21019919216632843), (499999, 0.0)]
([1, 7, 5], [1, 7, 3])
tensor(7.1330, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.092445373535156), (499999, 0.0)]
([1, 7, 5], [1, 7, 3])
tensor(5.6585, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.873831748962402), (499999, 0.0)]
([1, 0, 5], [1, 7, 3])
tensor(4.1296, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.992379188537598), (499999, 0.0)]
([1, 0, 5], [1, 7, 3])
tensor(4.0706, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.143834590911865), (499999, 0.0)]
([1, 0, 5], [1, 7, 3])
tensor(4.0643, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.168493270874023), (499999, 0.0)]
([1, 0, 5], [1, 7, 3])
tensor(4.0632, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.179717063903809), (49999

  1%|          | 8/725 [20:21<35:11:47, 176.72s/it]

test：0.0, test mean: 0.25
([1, 7, 1], [1, 7, 1])
tensor(6.1104, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.2098156362771988), (499999, 0.0)]
([1, 7, 1], [1, 7, 1])
tensor(6.0518, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.429378032684326), (499999, 0.0)]
([1, 7, 1], [1, 7, 1])
tensor(4.7394, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.875401496887207), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(3.6172, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.8988202810287476), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(3.6142, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.9084076881408691), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(3.6127, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.9182312488555908), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(3.6117, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.9

  1%|          | 9/725 [25:00<41:30:31, 208.70s/it]

test：0.0, test mean: 0.2
([2, 2, 1], [2, 2, 1])
tensor(10.2685, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  1%|▏         | 10/725 [25:27<30:18:30, 152.60s/it]

([7, 0, 2], [7, 0, 2])
tensor(4.8928, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 11/725 [25:54<22:37:59, 114.12s/it]

([7, 1, 6], [7, 1, 6])
tensor(5.5950, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21018444001674652), (499999, 0.0), (499999, 0.0)]
([7, 1, 6], [7, 1, 6])
tensor(5.5812, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.9860258102417), (499999, 0.0), (499999, 0.0)]
([7, 1, 6], [7, 1, 6])
tensor(3.9292, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.533359050750732), (499999, 0.0), (499999, 0.0)]
([0, 1, 6], [7, 1, 6])
tensor(2.5176, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.545135974884033), (499999, 0.0), (499999, 0.0)]
([0, 1, 6], [7, 1, 6])
tensor(2.4315, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.665031909942627), (499999, 0.0), (499999, 0.0)]
([0, 1, 6], [7, 1, 6])
tensor(2.4232, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.71811056137085), (499999, 0.0), (499999, 0.0)]
([0, 1, 6], [7, 1, 6])
tensor(2.4161, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.761135101318359), (499999, 0.0), (499999, 

  2%|▏         | 12/725 [30:25<32:02:25, 161.77s/it]

test：0.0, test mean: 0.16666666666666666
([1, 8, 1], [1, 8, 1])
tensor(5.0491, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 13/725 [30:50<23:50:03, 120.51s/it]

([1, 2, 5], [1, 2, 8])
tensor(6.2395, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 14/725 [31:18<18:13:52, 92.31s/it] 

([2, 2, 1], [2, 2, 1])
tensor(11.0351, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 15/725 [31:44<14:16:22, 72.37s/it]

([1, 5, 2], [1, 8, 2])
tensor(7.7525, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 16/725 [32:09<11:28:09, 58.24s/it]

([2, 2, 7], [2, 2, 7])
tensor(12.2301, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 3.782251224038191e-05), (499999, 0.0), (380927, 0.2101878970861435)]
([2, 2, 7], [2, 2, 7])
tensor(12.2305, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 7.564502448076382e-05), (499999, 0.0), (380927, -9.091907501220703)]
([2, 2, 7], [2, 2, 7])
tensor(10.6522, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00011346752580720931), (499999, 0.0), (380927, -4.906617164611816)]
([2, 2, 0], [2, 2, 7])
tensor(9.3232, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00015129004896152765), (499999, 0.0), (380927, -4.805576324462891)]
([2, 2, 0], [2, 2, 7])
tensor(9.2989, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0001891125866677612), (499999, 0.0), (380927, -4.990981101989746)]
([2, 2, 0], [2, 2, 7])
tensor(9.2928, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00022693505161441863), (499999, 0.0), (380927, -5.116344451904297)]
([2, 2, 0], [2, 2, 7])

  2%|▏         | 17/725 [37:07<25:35:54, 130.16s/it]

test：0.0, test mean: 0.14285714285714285
([1, 2, 5], [1, 2, 5])
tensor(6.5099, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 18/725 [37:32<19:23:11, 98.71s/it] 

([0, 6, 2], [0, 6, 2])
tensor(5.1840, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 8.134311792673543e-05), (499999, 0.0), (499999, 0.0)]


  3%|▎         | 19/725 [37:57<14:59:54, 76.48s/it]

([2, 0, 0], [2, 0, 1])
tensor(5.2713, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  3%|▎         | 20/725 [38:21<11:52:42, 60.66s/it]

([1, 0, 6], [1, 0, 6])
tensor(3.9795, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  3%|▎         | 21/725 [38:47<9:51:10, 50.38s/it] 

([7, 2, 8], [7, 2, 8])
tensor(7.4961, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.35890501737594604), (499999, 0.0), (499999, 0.0)]
([7, 2, 8], [7, 2, 8])
tensor(7.8779, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.579481363296509), (499999, 0.0), (499999, 0.0)]
([7, 2, 8], [7, 2, 8])
tensor(6.4313, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 8.433939933776855), (499999, 0.0), (499999, 0.0)]
([0, 2, 8], [7, 2, 8])
tensor(4.8270, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 8.438142776489258), (499999, 0.0), (499999, 0.0)]
([0, 2, 8], [7, 2, 8])
tensor(4.8247, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 8.444393157958984), (499999, 0.0), (499999, 0.0)]
([0, 2, 8], [7, 2, 8])
tensor(4.8230, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 8.437950134277344), (499999, 0.0), (499999, 0.0)]
([0, 2, 8], [7, 2, 8])
tensor(4.8225, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 8.431502342224121), (499999, 0.0), (499999, 0.

  3%|▎         | 22/725 [43:12<22:24:24, 114.74s/it]

test：0.0, test mean: 0.125
([0, 1, 2], [0, 1, 2])
tensor(4.6303, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.021739032119512558), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(4.7758, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.0829687118530273), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(4.5914, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.7634683847427368), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(4.5809, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.6500070095062256), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(4.5791, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.5697832107543945), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(4.5779, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.5086404085159302), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(4.5773, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.456204

  3%|▎         | 23/725 [47:43<31:31:31, 161.67s/it]

test：1.0, test mean: 0.2222222222222222
([2, 8, 7], [2, 8, 7])
tensor(8.2312, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.000704711361322552), (380927, 0.2208685725927353)]
([2, 8, 7], [2, 8, 7])
tensor(8.1025, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.001409422722645104), (380927, -4.464676856994629)]
([2, 8, 7], [2, 8, 7])
tensor(6.8636, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.002114133909344673), (380927, 0.15409404039382935)]
([2, 8, 0], [2, 8, 7])
tensor(6.0447, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.002818845445290208), (380927, 0.135761559009552)]
([2, 8, 0], [2, 8, 7])
tensor(6.0438, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0035235555842518806), (380927, 0.10246241092681885)]
([2, 8, 0], [2, 8, 7])
tensor(6.0421, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.004228267818689346), (380927, 0.07814

  3%|▎         | 24/725 [52:25<38:30:26, 197.76s/it]

test：0.0, test mean: 0.2
([1, 7, 1], [1, 7, 1])
tensor(6.6382, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.3479250371456146), (499999, 0.0)]
([1, 7, 1], [1, 7, 1])
tensor(6.9162, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.6590749025344849), (499999, 0.0)]
([1, 7, 1], [1, 7, 1])
tensor(5.0024, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.5422016382217407), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(3.5010, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.8510600328445435), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(3.4662, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.1421563625335693), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(3.4614, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.3172667026519775), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(3.4585, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927

  3%|▎         | 25/725 [56:59<42:54:40, 220.69s/it]

test：0.0, test mean: 0.18181818181818182
([1, 0, 2], [1, 0, 2])
tensor(7.8972, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  4%|▎         | 26/725 [57:25<31:29:12, 162.16s/it]

([5, 0, 2], [5, 0, 2])
tensor(6.8988, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  4%|▎         | 27/725 [57:50<23:28:37, 121.08s/it]

([7, 2, 0], [7, 2, 0])
tensor(8.2536, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.2212013602256775), (499999, 0.0), (479231, 0.08606286346912384)]
([7, 2, 0], [7, 2, 0])
tensor(8.6377, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.118375778198242), (499999, 0.0), (479231, 0.7392873764038086)]
([7, 2, 0], [7, 2, 0])
tensor(6.7754, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.2326769828796387), (499999, 0.0), (479231, 1.0346534252166748)]
([0, 2, 7], [7, 2, 0])
tensor(5.9568, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.688835382461548), (499999, 0.0), (479231, -0.9296754598617554)]
([0, 2, 0], [7, 2, 0])
tensor(5.2537, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.839052200317383), (499999, 0.0), (479231, -0.9225986003875732)]
([0, 2, 0], [7, 2, 0])
tensor(5.2472, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.969041109085083), (499999, 0.0), (479231, -0.9154729843139648)]
([0, 2, 0], [7, 2, 0])
tensor(5.2440, dev

  4%|▍         | 28/725 [1:02:26<32:28:13, 167.71s/it]

test：0.5, test mean: 0.20833333333333334
([2, 1, 7], [2, 1, 7])
tensor(5.5587, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  4%|▍         | 29/725 [1:02:53<24:13:02, 125.26s/it]

([2, 2, 2], [2, 2, 2])
tensor(12.1546, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  4%|▍         | 30/725 [1:03:19<18:28:44, 95.72s/it] 

([5, 1, 1], [5, 1, 1])
tensor(5.5636, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  4%|▍         | 31/725 [1:03:45<14:24:22, 74.73s/it]

([2, 2, 8], [2, 2, 8])
tensor(8.2564, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -1.5452096704393625e-05)]


  4%|▍         | 32/725 [1:04:10<11:30:05, 59.75s/it]

([1, 2, 0], [1, 2, 0])
tensor(5.0135, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  5%|▍         | 33/725 [1:04:34<9:23:36, 48.87s/it] 

([8, 1, 1], [8, 1, 1])
tensor(5.5179, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  5%|▍         | 34/725 [1:04:58<7:58:15, 41.53s/it]

([5, 1, 0], [5, 1, 0])
tensor(2.0715, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.43826431035995483), (499999, 0.0), (454655, -0.0011882721446454525)]
([0, 1, 0], [5, 1, 0])
tensor(1.6935, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 4.028852939605713), (499999, 0.0), (454655, 4.58638858795166)]
([0, 1, 0], [5, 1, 0])
tensor(1.2614, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 4.192559242248535), (499999, 0.0), (454655, 5.00778865814209)]
([0, 1, 0], [5, 1, 0])
tensor(1.2814, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 4.332658767700195), (499999, 0.0), (454655, 5.346301555633545)]
([0, 1, 0], [5, 1, 0])
tensor(1.2390, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 4.446735382080078), (499999, 0.0), (454655, 5.375274658203125)]
([0, 1, 0], [5, 1, 0])
tensor(1.2380, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 4.547221660614014), (499999, 0.0), (454655, 5.399592876434326)]
([0, 1, 0], [5, 1, 0])
tensor(1.2372, device='cuda:0

  5%|▍         | 35/725 [1:09:18<20:33:12, 107.24s/it]

test：0.5, test mean: 0.23076923076923078
([1, 2, 2], [1, 2, 2])
tensor(9.2302, device='cuda:0', grad_fn=<NllLossBackward0>)


  5%|▍         | 36/725 [1:09:44<15:49:41, 82.70s/it] 

[(499999, 0.0), (499999, 0.0), (499999, 0.0)]
([2, 2, 2], [2, 2, 2])
tensor(14.1467, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  5%|▌         | 37/725 [1:10:07<12:21:53, 64.70s/it]

([5, 0, 0], [5, 0, 0])
tensor(1.9253, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -0.01603703945875168)]
([5, 0, 0], [5, 0, 0])
tensor(2.1868, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -1.0597293376922607)]
([5, 0, 0], [5, 0, 0])
tensor(1.9208, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -1.2349649667739868)]
([5, 0, 0], [5, 0, 0])
tensor(2.1085, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -2.2974038124084473)]
([5, 0, 0], [5, 0, 0])
tensor(1.8924, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -2.34812331199646)]
([5, 0, 0], [5, 0, 0])
tensor(1.8908, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -2.3038320541381836)]
([5, 0, 0], [5, 0, 0])
tensor(1.8859, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -2.33537

  5%|▌         | 38/725 [1:14:31<23:45:45, 124.52s/it]

test：1.0, test mean: 0.2857142857142857
([7, 3, 8], [7, 3, 8])
tensor(3.9488, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.20832374691963196), (499999, 0.0), (499999, 0.0)]
([7, 3, 8], [7, 3, 8])
tensor(3.6737, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.76973819732666), (499999, 0.0), (499999, 0.0)]
([7, 3, 8], [7, 3, 8])
tensor(2.8898, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.2072944641113281), (499999, 0.0), (499999, 0.0)]
([0, 3, 8], [7, 3, 8])
tensor(2.1765, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.2866716384887695), (499999, 0.0), (499999, 0.0)]
([0, 3, 8], [7, 3, 8])
tensor(2.1729, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.3272969722747803), (499999, 0.0), (499999, 0.0)]
([0, 3, 8], [7, 3, 8])
tensor(2.1720, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.3637852668762207), (499999, 0.0), (499999, 0.0)]
([0, 3, 8], [7, 3, 8])
tensor(2.1712, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.3

  5%|▌         | 39/725 [1:18:38<30:43:32, 161.24s/it]

test：0.0, test mean: 0.26666666666666666
([6, 1, 2], [6, 1, 2])
tensor(8.3895, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 2.9883523893659003e-05)]


  6%|▌         | 40/725 [1:19:01<22:48:35, 119.88s/it]

([5, 8, 2], [8, 8, 2])
tensor(4.9938, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  6%|▌         | 41/725 [1:19:25<17:17:05, 90.97s/it] 

([1, 0, 0], [1, 0, 1])
tensor(2.3109, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  6%|▌         | 42/725 [1:19:49<13:28:04, 70.99s/it]

([1, 2, 5], [1, 2, 1])
tensor(9.5278, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  6%|▌         | 43/725 [1:20:14<10:49:38, 57.15s/it]

([1, 2, 0], [1, 2, 0])
tensor(5.3861, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  6%|▌         | 44/725 [1:20:37<8:54:15, 47.07s/it] 

([7, 0, 2], [7, 7, 2])
tensor(10.6383, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21022143959999084), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 7, 2])
tensor(10.6387, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -11.156481742858887), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 7, 2])
tensor(8.9870, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.178177833557129), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 7, 2])
tensor(7.4199, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.822741508483887), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(6.9889, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.227829456329346), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(6.9185, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.103837490081787), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(6.8943, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.9158430099487305), (499999, 0.0), (4

  6%|▌         | 45/725 [1:24:43<20:09:46, 106.74s/it]

test：0.0, test mean: 0.25
([8, 8, 0], [8, 8, 0])
tensor(2.6029, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 0.002413438633084297)]
([8, 8, 1], [8, 8, 0])
tensor(2.9890, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 0.661821722984314)]
([8, 8, 0], [8, 8, 0])
tensor(2.6012, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 0.5995994210243225)]
([8, 8, 1], [8, 8, 0])
tensor(2.9242, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.328319549560547)]
([8, 8, 0], [8, 8, 0])
tensor(2.5980, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.3045735359191895)]
([8, 8, 0], [8, 8, 0])
tensor(2.7279, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.237579822540283)]
([8, 8, 0], [8, 8, 0])
tensor(2.5971, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0

  6%|▋         | 46/725 [1:29:08<29:05:06, 154.21s/it]

test：1.0, test mean: 0.29411764705882354
([8, 5, 1], [8, 5, 1])
tensor(3.9006, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 3.8353802665369585e-05), (499999, 0.0)]


  6%|▋         | 47/725 [1:29:31<21:38:17, 114.89s/it]

([6, 1, 1], [6, 1, 1])
tensor(6.5309, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 48/725 [1:29:56<16:31:50, 87.90s/it] 

([8, 0, 5], [8, 0, 5])
tensor(2.1090, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -0.008899497799575329), (499999, 0.0)]
([8, 0, 5], [8, 0, 5])
tensor(2.1773, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -2.171104669570923), (499999, 0.0)]
([8, 0, 5], [8, 0, 5])
tensor(2.1091, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -2.1946728229522705), (499999, 0.0)]
([8, 0, 5], [8, 0, 5])
tensor(2.1660, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -3.8710737228393555), (499999, 0.0)]
([8, 0, 5], [8, 0, 5])
tensor(2.1049, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -3.7751340866088867), (499999, 0.0)]
([8, 0, 5], [8, 0, 5])
tensor(2.1027, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -3.8116238117218018), (499999, 0.0)]
([8, 0, 5], [8, 0, 5])
tensor(2.0992, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -3.8092260360717773),

  7%|▋         | 49/725 [1:34:09<25:46:11, 137.24s/it]

test：1.0, test mean: 0.3333333333333333
([3, 8, 7], [3, 8, 7])
tensor(5.7558, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.20992952585220337)]
([3, 8, 7], [3, 8, 7])
tensor(5.7325, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -7.859113693237305)]
([3, 8, 7], [3, 8, 7])
tensor(4.4040, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -1.5789194107055664)]
([3, 8, 0], [3, 8, 7])
tensor(3.2910, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -1.600721836090088)]
([3, 8, 0], [3, 8, 7])
tensor(3.2880, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -1.6412651538848877)]
([3, 8, 0], [3, 8, 7])
tensor(3.2856, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -1.6769464015960693)]
([3, 8, 0], [3, 8, 7])
tensor(3.2845, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999,

  7%|▋         | 50/725 [1:38:28<32:36:22, 173.90s/it]

test：0.0, test mean: 0.3157894736842105
([2, 1, 1], [2, 1, 1])
tensor(8.3552, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 51/725 [1:38:51<24:05:49, 128.71s/it]

([5, 2, 8], [5, 2, 8])
tensor(7.3810, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0004923061933368444), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 52/725 [1:39:15<18:09:15, 97.11s/it] 

([1, 0, 1], [1, 0, 1])
tensor(3.9209, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 53/725 [1:39:39<14:02:18, 75.21s/it]

([1, 2, 1], [1, 2, 1])
tensor(7.5281, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 54/725 [1:40:02<11:05:31, 59.51s/it]

([1, 6, 8], [1, 6, 8])
tensor(4.3911, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 55/725 [1:40:25<9:02:39, 48.60s/it] 

([7, 2, 1], [7, 2, 1])
tensor(9.3215, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.209797203540802), (499999, 0.0), (499999, 0.0)]
([7, 2, 1], [7, 2, 1])
tensor(9.2634, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.427309513092041), (499999, 0.0), (499999, 0.0)]
([7, 2, 1], [7, 2, 1])
tensor(7.9556, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.8306646347045898), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(6.8878, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.8815240859985352), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(6.8859, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.9186561107635498), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(6.8843, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.9071260690689087), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(6.8834, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.8955979347229004), (499999, 0.0), (499999,

  8%|▊         | 56/725 [1:44:22<19:31:54, 105.10s/it]

test：0.0, test mean: 0.3
([2, 1, 5], [2, 1, 6])
tensor(7.9053, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 57/725 [1:44:46<14:58:54, 80.74s/it] 

([2, 7, 1], [2, 7, 1])
tensor(8.3934, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.3587718605995178), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(8.9225, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -2.355659008026123), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(7.5108, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 4.460549831390381), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(6.2275, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 4.113632678985596), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(6.2254, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 3.9401142597198486), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(6.2246, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 3.7714993953704834), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(6.2243, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 3.6111562252044678), (499999,

  8%|▊         | 58/725 [1:48:52<24:08:54, 130.34s/it]

test：0.0, test mean: 0.2857142857142857
([2, 1, 8], [2, 1, 8])
tensor(7.4402, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, 0.11236236989498138)]
([2, 1, 1], [2, 1, 8])
tensor(7.7886, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, 4.290914535522461)]
([2, 1, 0], [2, 1, 8])
tensor(6.9580, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, 4.375716209411621)]
([2, 1, 0], [2, 1, 8])
tensor(6.9532, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, 4.423269271850586)]
([2, 1, 0], [2, 1, 8])
tensor(6.9514, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, 4.398327827453613)]
([2, 1, 0], [2, 1, 8])
tensor(6.9498, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, 4.36875581741333)]
([2, 1, 0], [2, 1, 8])
tensor(6.9491, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (4

  8%|▊         | 59/725 [1:52:54<30:17:37, 163.75s/it]

test：0.0, test mean: 0.2727272727272727
([5, 7, 2], [5, 7, 2])
tensor(7.5494, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 60/725 [1:53:16<22:26:37, 121.50s/it]

([5, 2, 2], [5, 2, 2])
tensor(8.1739, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 61/725 [1:53:39<16:55:37, 91.77s/it] 

([1, 1, 5], [1, 1, 8])
tensor(6.1988, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▊         | 62/725 [1:54:01<13:03:40, 70.92s/it]

([7, 7, 1], [7, 7, 1])
tensor(7.5045, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.34788766503334045), (380927, 0.21011866629123688), (499999, 0.0)]
([7, 7, 1], [7, 7, 1])
tensor(7.7415, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.059464454650879), (380927, -6.3478922843933105), (499999, 0.0)]
([7, 7, 1], [7, 7, 1])
tensor(4.7677, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 3.6162562370300293), (380927, -1.0938761234283447), (499999, 0.0)]
([0, 0, 1], [7, 7, 1])
tensor(1.5181, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.6931517124176025), (380927, -1.82974112033844), (499999, 0.0)]
([0, 0, 1], [7, 7, 1])
tensor(1.4343, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.452597141265869), (380927, -2.190276861190796), (499999, 0.0)]
([0, 0, 1], [7, 7, 1])
tensor(1.4216, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.292278289794922), (380927, -2.419862747192383), (499999, 0.0)]
([0, 0, 1], [7, 7, 1])
tensor(1.4167, device

  9%|▊         | 63/725 [1:57:56<22:05:51, 120.17s/it]

test：0.0, test mean: 0.2608695652173913
([5, 3, 2], [5, 3, 2])
tensor(7.8974, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 64/725 [1:58:20<16:44:04, 91.14s/it] 

([2, 5, 2], [2, 5, 2])
tensor(9.0269, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 65/725 [1:58:43<12:59:03, 70.82s/it]

([6, 2, 2], [6, 2, 2])
tensor(12.3020, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 66/725 [1:59:08<10:27:21, 57.12s/it]

([5, 2, 2], [2, 2, 2])
tensor(9.6752, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 67/725 [1:59:31<8:35:11, 46.98s/it] 

([2, 8, 0], [2, 8, 0])
tensor(6.6855, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 68/725 [1:59:55<7:17:44, 39.98s/it]

([5, 2, 0], [0, 2, 1])
tensor(5.5799, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|▉         | 69/725 [2:00:18<6:20:32, 34.81s/it]

([6, 2, 2], [6, 2, 2])
tensor(9.8868, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|▉         | 70/725 [2:00:43<5:47:46, 31.86s/it]

([7, 1, 5], [7, 1, 5])
tensor(6.7527, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21018296480178833), (499999, 0.0), (499999, 0.0)]
([7, 1, 5], [7, 1, 5])
tensor(6.8933, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.092253684997559), (499999, 0.0), (499999, 0.0)]
([7, 1, 5], [7, 1, 5])
tensor(5.3172, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.8944091796875), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [7, 1, 5])
tensor(3.9244, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.792412757873535), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [7, 1, 5])
tensor(3.8986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.001131057739258), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [7, 1, 5])
tensor(3.8917, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.104905605316162), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [7, 1, 5])
tensor(3.8879, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.188445568084717), (499999, 0.0), (499999,

 10%|▉         | 71/725 [2:05:17<18:59:16, 104.52s/it]

test：0.0, test mean: 0.25
([2, 5, 7], [2, 3, 7])
tensor(8.9171, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.00038736092392355204), (380927, 0.2102203667163849)]
([2, 5, 7], [2, 3, 7])
tensor(8.8716, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0007747218478471041), (380927, -9.797582626342773)]
([2, 5, 7], [2, 3, 7])
tensor(7.2040, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0011620826553553343), (380927, -4.08682918548584)]
([2, 5, 7], [2, 3, 7])
tensor(5.6554, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0015494436956942081), (380927, -1.2632720470428467)]
([2, 5, 0], [2, 3, 7])
tensor(5.2659, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0019368045032024384), (380927, -1.755258321762085)]
([2, 5, 0], [2, 3, 7])
tensor(5.2250, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0023241653107106686), (380927, -1.8662594556808472)]


 10%|▉         | 72/725 [2:10:02<28:46:54, 158.67s/it]

test：0.0, test mean: 0.24
([3, 8, 6], [3, 8, 6])
tensor(5.0458, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|█         | 73/725 [2:10:25<21:21:01, 117.89s/it]

([2, 6, 0], [2, 6, 0])
tensor(5.6000, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -0.008035179227590561)]
([2, 6, 0], [2, 6, 0])
tensor(5.6251, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -0.8921264410018921)]
([2, 6, 0], [2, 6, 0])
tensor(5.5997, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -0.9301724433898926)]
([2, 6, 0], [2, 6, 0])
tensor(5.6193, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -1.6091359853744507)]
([2, 6, 0], [2, 6, 0])
tensor(5.5971, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -1.5761566162109375)]
([2, 6, 0], [2, 6, 0])
tensor(5.5960, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -1.594560146331787)]
([2, 6, 0], [2, 6, 0])
tensor(5.5953, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -1.609

 10%|█         | 74/725 [2:14:31<28:18:27, 156.54s/it]

test：1.0, test mean: 0.2692307692307692
([5, 2, 2], [5, 2, 2])
tensor(8.4869, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|█         | 75/725 [2:14:55<21:04:28, 116.72s/it]

([2, 8, 1], [2, 8, 1])
tensor(8.0752, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|█         | 76/725 [2:15:20<16:04:43, 89.19s/it] 

([2, 8, 5], [2, 8, 5])
tensor(8.9042, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 11%|█         | 77/725 [2:15:44<12:32:49, 69.71s/it]

([0, 2, 8], [0, 2, 8])
tensor(6.4116, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 11%|█         | 78/725 [2:16:09<10:05:06, 56.12s/it]

([5, 7, 1], [3, 7, 1])
tensor(6.1905, device='cuda:0', grad_fn=<NllLossBackward0>)
[(275966, -0.07825746387243271), (380927, 0.21020719408988953), (499999, 0.0)]
([0, 7, 1], [3, 7, 1])
tensor(5.8898, device='cuda:0', grad_fn=<NllLossBackward0>)
[(275966, 3.601917266845703), (380927, -9.07332992553711), (499999, 0.0)]
([0, 7, 1], [3, 7, 1])
tensor(4.1914, device='cuda:0', grad_fn=<NllLossBackward0>)
[(275966, 4.356860160827637), (380927, -4.780453681945801), (499999, 0.0)]
([0, 0, 1], [3, 7, 1])
tensor(2.5924, device='cuda:0', grad_fn=<NllLossBackward0>)
[(275966, 4.364631652832031), (380927, -4.382702827453613), (499999, 0.0)]
([0, 0, 1], [3, 7, 1])
tensor(2.5253, device='cuda:0', grad_fn=<NllLossBackward0>)
[(275966, 4.3674845695495605), (380927, -4.239556312561035), (499999, 0.0)]
([0, 0, 1], [3, 7, 1])
tensor(2.5197, device='cuda:0', grad_fn=<NllLossBackward0>)
[(275966, 4.398441314697266), (380927, -4.185174465179443), (499999, 0.0)]
([0, 0, 1], [3, 7, 1])
tensor(2.5176, device='cu

 11%|█         | 79/725 [2:20:48<22:04:22, 123.01s/it]

test：0.0, test mean: 0.25925925925925924
([5, 2, 8], [5, 2, 5])
tensor(4.8818, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 11%|█         | 80/725 [2:21:14<16:51:11, 94.06s/it] 

([1, 2, 2], [1, 2, 2])
tensor(10.0474, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 11%|█         | 81/725 [2:21:40<13:09:25, 73.55s/it]

([2, 0, 2], [2, 0, 2])
tensor(6.6357, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 11%|█▏        | 82/725 [2:22:06<10:36:39, 59.41s/it]

([5, 5, 2], [5, 1, 2])
tensor(7.2353, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0030708620324730873), (499999, -0.008355113677680492), (499999, 0.0)]


 11%|█▏        | 83/725 [2:22:33<8:50:03, 49.54s/it] 

([2, 3, 1], [2, 3, 1])
tensor(7.2088, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 12%|█▏        | 84/725 [2:22:59<7:32:18, 42.34s/it]

([7, 2, 5], [7, 2, 3])
tensor(8.4060, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.20912925899028778), (499999, 0.0), (499999, -0.010026199743151665)]
([7, 2, 5], [7, 2, 3])
tensor(8.1402, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.445448398590088), (499999, 0.0), (499999, -0.02005239948630333)]
([7, 2, 5], [7, 2, 3])
tensor(7.4129, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.5667195320129395), (499999, 0.0), (499999, -0.030078593641519547)]
([0, 2, 5], [7, 2, 3])
tensor(6.5580, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.7911882400512695), (499999, 0.0), (499999, -0.04010479897260666)]
([0, 2, 5], [7, 2, 3])
tensor(6.5559, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.9163498878479), (499999, 0.0), (499999, -0.050130993127822876)]
([0, 2, 5], [7, 2, 3])
tensor(6.5547, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 6.027829647064209), (499999, 0.0), (499999, -0.06015719473361969)]
([0, 2, 5], [7, 2, 3])
tensor(6.55

 12%|█▏        | 85/725 [2:27:06<18:28:38, 103.94s/it]

test：0.0, test mean: 0.25
([0, 1, 1], [0, 1, 1])
tensor(3.0164, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 12%|█▏        | 86/725 [2:27:31<14:13:34, 80.15s/it] 

([1, 2, 0], [1, 2, 0])
tensor(6.6065, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 12%|█▏        | 87/725 [2:27:54<11:09:34, 62.97s/it]

([5, 7, 1], [0, 7, 1])
tensor(6.1594, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0036930739879608154), (380927, 0.22122889757156372), (499999, 0.0)]
([5, 7, 1], [0, 7, 1])
tensor(6.2584, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.007386147975921631), (380927, -7.87950325012207), (499999, 0.0)]
([5, 7, 1], [0, 7, 1])
tensor(4.5636, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.011079220101237297), (380927, -2.8534722328186035), (499999, 0.0)]
([5, 7, 1], [0, 7, 1])
tensor(2.9747, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.014772295951843262), (380927, 1.134761095046997), (499999, 0.0)]
([5, 0, 1], [0, 7, 1])
tensor(2.6080, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.018465369939804077), (380927, 0.02239847183227539), (499999, 0.0)]
([5, 0, 1], [0, 7, 1])
tensor(2.5568, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.022158440202474594), (380927, -1.0209332704544067), (499999, 0.0)]
([5, 0, 1], [0, 7, 1])
tensor(2

 12%|█▏        | 88/725 [2:32:10<21:22:38, 120.81s/it]

test：0.0, test mean: 0.2413793103448276
([0, 2, 2], [0, 2, 2])
tensor(7.6883, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.043980538845062256), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [0, 2, 2])
tensor(8.0946, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.006554901599884033), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(7.4684, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 1.2218897342681885), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(7.4351, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 1.6830172538757324), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(7.4308, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 2.010836124420166), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(7.4280, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 2.271709442138672), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(7.4268, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231,

 12%|█▏        | 89/725 [2:36:33<28:55:47, 163.75s/it]

test：1.0, test mean: 0.26666666666666666
([5, 1, 1], [6, 1, 1])
tensor(6.1324, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 12%|█▏        | 90/725 [2:37:00<21:38:02, 122.65s/it]

([1, 5, 5], [1, 5, 8])
tensor(4.1683, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.009492152370512486)]


 13%|█▎        | 91/725 [2:37:27<16:31:01, 93.79s/it] 

([7, 1, 0], [7, 1, 0])
tensor(5.1149, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21019183099269867), (499999, 0.0), (499999, 0.0)]
([7, 1, 0], [7, 1, 0])
tensor(5.1076, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.092045783996582), (499999, 0.0), (499999, 0.0)]
([7, 1, 0], [7, 1, 0])
tensor(3.5413, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.033146381378174), (499999, 0.0), (499999, 0.0)]
([0, 1, 0], [7, 1, 0])
tensor(2.2459, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.88291597366333), (499999, 0.0), (499999, 0.0)]
([0, 1, 0], [7, 1, 0])
tensor(2.1768, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.232389450073242), (499999, 0.0), (499999, 0.0)]
([0, 1, 0], [7, 1, 0])
tensor(2.1568, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.4239397048950195), (499999, 0.0), (499999, 0.0)]
([0, 1, 0], [7, 1, 0])
tensor(2.1501, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.617565155029297), (499999, 0.0), (49999

 13%|█▎        | 92/725 [2:41:50<25:26:30, 144.69s/it]

test：0.0, test mean: 0.25806451612903225
([8, 2, 5], [8, 2, 5])
tensor(9.4091, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 13%|█▎        | 93/725 [2:42:14<19:03:40, 108.58s/it]

([7, 0, 0], [7, 0, 0])
tensor(1.2615, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 13%|█▎        | 94/725 [2:42:37<14:31:37, 82.88s/it] 

([2, 7, 0], [2, 7, 7])
tensor(5.6922, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 13%|█▎        | 95/725 [2:43:00<11:21:55, 64.95s/it]

([0, 0, 2], [0, 0, 2])
tensor(3.0465, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.0007948391139507294), (499999, 0.0)]
([0, 7, 2], [0, 0, 2])
tensor(3.4000, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -2.7800495624542236), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(3.0117, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -3.222308397293091), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(3.0496, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -4.007287979125977), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(2.9612, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -4.069364547729492), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(2.9585, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -4.148383617401123), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(2.9564, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -4.255991458892822), (49

 13%|█▎        | 96/725 [2:47:07<20:50:28, 119.28s/it]

test：1.0, test mean: 0.28125
([5, 2, 1], [5, 2, 1])
tensor(5.7111, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 13%|█▎        | 97/725 [2:47:30<15:46:12, 90.40s/it] 

([1, 2, 2], [1, 2, 2])
tensor(8.5726, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.002906334586441517), (499999, 0.0), (499999, -0.00017367035616189241)]


 14%|█▎        | 98/725 [2:47:53<12:13:46, 70.22s/it]

([2, 7, 1], [2, 7, 1])
tensor(10.9812, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21021994948387146), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(11.0682, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.797616958618164), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(9.4261, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.082640171051025), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(7.8612, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.662291407585144), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(7.4217, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.3381530046463013), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(7.3691, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.646578311920166), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(7.3471, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.3514132499694824), (4

 14%|█▎        | 99/725 [2:51:55<21:10:36, 121.78s/it]

test：0.0, test mean: 0.2727272727272727
([7, 2, 0], [7, 2, 0])
tensor(6.2638, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21020299196243286), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(6.1650, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.092418670654297), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(4.5940, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.881452560424805), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(3.1242, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.6029863357543945), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(3.0861, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.480883598327637), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(3.0796, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.433018684387207), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(3.0773, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -

 14%|█▍        | 100/725 [2:55:48<26:56:09, 155.15s/it]

test：0.0, test mean: 0.2647058823529412
([5, 8, 0], [0, 8, 0])
tensor(1.9627, device='cuda:0', grad_fn=<NllLossBackward0>)


 14%|█▍        | 101/725 [2:56:11<20:02:16, 115.60s/it]

[(499999, 0.0), (499999, 0.0), (499999, 0.0)]
([2, 0, 5], [2, 0, 3])
tensor(7.9934, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -0.005944561213254929), (499999, 0.0)]
([2, 0, 5], [2, 0, 3])
tensor(8.0032, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -0.35849714279174805), (499999, 0.0)]
([2, 0, 5], [2, 0, 3])
tensor(7.9898, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -0.4294719398021698), (499999, 0.0)]
([2, 0, 5], [2, 0, 3])
tensor(7.9903, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -0.6199008226394653), (499999, 0.0)]
([2, 0, 5], [2, 0, 3])
tensor(7.9818, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -0.6525689959526062), (499999, 0.0)]
([2, 0, 5], [2, 0, 3])
tensor(7.9816, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -0.7066066265106201), (499999, 0.0)]
([2, 0, 5], [2, 0, 3])
tensor(7.9801, device='cuda:0', grad_fn=<NllLossBackward0>

 14%|█▍        | 102/725 [3:00:14<26:37:01, 153.81s/it]

test：1.0, test mean: 0.2857142857142857
([0, 2, 1], [0, 2, 1])
tensor(7.0134, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 14%|█▍        | 103/725 [3:00:36<19:43:52, 114.20s/it]

([7, 2, 2], [7, 2, 2])
tensor(11.7385, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21019630134105682), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(11.7867, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.092459678649902), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(10.2178, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.0101776123046875), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(8.8356, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.766148328781128), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(8.7300, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.7500176429748535), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(8.7097, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.804826259613037), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(8.7000, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.906069040298462), (499999, 0.0), (

 14%|█▍        | 104/725 [3:04:32<26:01:19, 150.85s/it]

test：0.0, test mean: 0.2777777777777778
([2, 1, 0], [2, 1, 0])
tensor(7.5894, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0027420492842793465), (499999, 0.0)]


 14%|█▍        | 105/725 [3:04:54<19:20:11, 112.28s/it]

([7, 0, 2], [7, 0, 2])
tensor(5.8193, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 15%|█▍        | 106/725 [3:05:17<14:41:19, 85.43s/it] 

([1, 2, 8], [1, 2, 8])
tensor(7.3653, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.0006719997036270797)]


 15%|█▍        | 107/725 [3:05:41<11:28:20, 66.83s/it]

([7, 2, 1], [0, 2, 1])
tensor(5.6920, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 15%|█▍        | 108/725 [3:06:03<9:10:31, 53.54s/it] 

([1, 7, 5], [1, 7, 5])
tensor(7.6749, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.3589414656162262), (499999, 0.0)]
([1, 7, 5], [1, 7, 5])
tensor(8.0401, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.3430659770965576), (499999, 0.0)]
([1, 7, 5], [1, 7, 5])
tensor(6.1566, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.142195224761963), (499999, 0.0)]
([1, 7, 5], [1, 7, 5])
tensor(4.2967, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 5.203095436096191), (499999, 0.0)]
([1, 0, 5], [1, 7, 5])
tensor(4.0373, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 7.002910137176514), (499999, 0.0)]
([1, 0, 5], [1, 7, 5])
tensor(3.9811, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 8.429134368896484), (499999, 0.0)]
([1, 0, 5], [1, 7, 5])
tensor(3.9606, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 9.408123016357422), (499999, 0

 15%|█▌        | 109/725 [3:10:07<18:54:33, 110.51s/it]

test：0.0, test mean: 0.2702702702702703
([1, 8, 7], [1, 8, 7])
tensor(6.7825, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.22122922539710999)]
([1, 8, 7], [1, 8, 7])
tensor(6.8445, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -7.601762294769287)]
([1, 8, 7], [1, 8, 7])
tensor(5.1410, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -3.1043262481689453)]
([1, 8, 7], [1, 8, 7])
tensor(3.5577, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.7169275283813477)]
([1, 8, 0], [1, 8, 7])
tensor(3.2062, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -2.819959878921509)]
([1, 8, 0], [1, 8, 7])
tensor(3.1477, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -2.759794235229492)]
([1, 8, 0], [1, 8, 7])
tensor(3.1278, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 

 15%|█▌        | 110/725 [3:14:25<26:27:30, 154.88s/it]

test：0.0, test mean: 0.2631578947368421
([1, 2, 2], [1, 2, 2])
tensor(7.7285, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 15%|█▌        | 111/725 [3:14:48<19:38:55, 115.20s/it]

([2, 2, 0], [2, 2, 0])
tensor(8.7694, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 15%|█▌        | 112/725 [3:15:10<14:53:45, 87.48s/it] 

([8, 2, 0], [8, 2, 0])
tensor(5.7146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0013593960320577025), (499999, 0.0)]


 16%|█▌        | 113/725 [3:15:34<11:35:07, 68.15s/it]

([5, 1, 5], [5, 1, 5])
tensor(5.6503, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 16%|█▌        | 114/725 [3:15:56<9:14:00, 54.40s/it] 

([5, 2, 0], [5, 2, 0])
tensor(5.2633, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0028719687834382057), (499999, 0.0), (430079, 0.09272000193595886)]
([5, 2, 0], [5, 2, 0])
tensor(5.4441, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0057439375668764114), (499999, 0.0), (430079, 0.9895918965339661)]
([5, 2, 0], [5, 2, 0])
tensor(5.2480, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.008615905418992043), (499999, 0.0), (430079, 1.9182604551315308)]
([5, 2, 7], [5, 2, 0])
tensor(5.7729, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.011487875133752823), (499999, 0.0), (430079, 1.5561248064041138)]
([5, 2, 0], [5, 2, 0])
tensor(5.2207, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.014359842985868454), (499999, 0.0), (430079, 1.5560336112976074)]
([5, 2, 0], [5, 2, 0])
tensor(5.2207, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.017231812700629234), (499999, 0.0), (430079, 1.5559426546096802)]
([5, 2, 0], [5, 2, 0])
te

 16%|█▌        | 115/725 [3:20:07<19:11:49, 113.29s/it]

test：1.0, test mean: 0.28205128205128205
([5, 0, 6], [8, 0, 6])
tensor(2.7883, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.001056453213095665), (499999, 0.0)]
([5, 0, 6], [8, 0, 6])
tensor(2.7874, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.01461155153810978), (499999, 0.0)]
([5, 0, 6], [8, 0, 6])
tensor(2.7872, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.024788152426481247), (499999, 0.0)]
([5, 0, 6], [8, 0, 6])
tensor(2.7846, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.037203047424554825), (499999, 0.0)]
([5, 0, 6], [8, 0, 6])
tensor(2.7833, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.04576968401670456), (499999, 0.0)]
([5, 0, 6], [8, 0, 6])
tensor(2.7829, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.05935771018266678), (499999, 0.0)]
([5, 0, 6], [8, 0, 6])
tensor(2.7826, device='cuda:0', grad_fn=<NllLossBackward0

 16%|█▌        | 116/725 [3:24:19<26:13:46, 155.05s/it]

test：1.0, test mean: 0.3
([8, 7, 1], [8, 7, 1])
tensor(4.3146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 16%|█▌        | 117/725 [3:24:44<19:34:24, 115.90s/it]

([5, 1, 8], [5, 1, 0])
tensor(3.7689, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0013250585179775953), (499999, 0.0)]


 16%|█▋        | 118/725 [3:25:07<14:52:09, 88.19s/it] 

([7, 3, 2], [7, 3, 2])
tensor(9.5268, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.1730220913887024), (499999, 0.0), (499999, 0.0)]
([7, 3, 2], [7, 3, 2])
tensor(9.4598, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -14.563095092773438), (499999, 0.0), (499999, 0.0)]
([7, 3, 2], [7, 3, 2])
tensor(8.3021, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.797116279602051), (499999, 0.0), (499999, 0.0)]
([7, 3, 2], [7, 3, 2])
tensor(6.8413, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.011085033416748), (499999, 0.0), (499999, 0.0)]
([0, 3, 2], [7, 3, 2])
tensor(6.5324, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.889669418334961), (499999, 0.0), (499999, 0.0)]
([0, 3, 2], [7, 3, 2])
tensor(6.5128, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.441643714904785), (499999, 0.0), (499999, 0.0)]
([0, 3, 2], [7, 3, 2])
tensor(6.5043, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.123971462249756), (499999, 0.0), (49999

 16%|█▋        | 119/725 [3:29:16<22:57:35, 136.40s/it]

test：0.0, test mean: 0.2926829268292683
([1, 3, 2], [1, 3, 2])
tensor(5.9634, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, -0.748542308807373), (499999, 0.0)]
([1, 3, 2], [1, 3, 2])
tensor(5.9055, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, -2.405029773712158), (499999, 0.0)]
([1, 0, 2], [1, 3, 2])
tensor(5.0573, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, -1.3352782726287842), (499999, 0.0)]
([1, 0, 2], [1, 3, 2])
tensor(4.9368, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, -1.746164083480835), (499999, 0.0)]
([1, 0, 2], [1, 3, 2])
tensor(4.8995, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, -1.7591248750686646), (499999, 0.0)]
([1, 0, 2], [1, 3, 2])
tensor(4.8866, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, -1.8138898611068726), (499999, 0.0)]
([1, 0, 2], [1, 3, 2])
tensor(4.8851, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 

 17%|█▋        | 120/725 [3:33:28<28:44:11, 170.99s/it]

test：0.0, test mean: 0.2857142857142857
([2, 1, 2], [2, 1, 2])
tensor(8.1390, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 17%|█▋        | 121/725 [3:33:51<21:14:00, 126.56s/it]

([0, 2, 5], [0, 2, 1])
tensor(5.0113, device='cuda:0', grad_fn=<NllLossBackward0>)


 17%|█▋        | 122/725 [3:34:13<15:58:38, 95.39s/it] 

[(499999, 0.0), (499999, 0.0), (499999, 0.0)]
([8, 5, 1], [8, 1, 1])
tensor(4.2077, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 17%|█▋        | 123/725 [3:34:38<12:24:15, 74.18s/it]

([7, 2, 6], [7, 2, 6])
tensor(8.9318, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.3589419722557068), (499999, 0.0), (499999, 0.0)]
([7, 2, 6], [7, 2, 6])
tensor(9.2060, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.3430649042129517), (499999, 0.0), (499999, 0.0)]
([7, 2, 6], [7, 2, 6])
tensor(7.3771, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 6.142421722412109), (499999, 0.0), (499999, 0.0)]
([7, 2, 6], [7, 2, 6])
tensor(5.5092, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.202284812927246), (499999, 0.0), (499999, 0.0)]
([0, 2, 6], [7, 2, 6])
tensor(5.2450, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 7.287539958953857), (499999, 0.0), (499999, 0.0)]
([0, 2, 6], [7, 2, 6])
tensor(5.1905, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 8.849128723144531), (499999, 0.0), (499999, 0.0)]
([0, 2, 6], [7, 2, 6])
tensor(5.1702, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 9.645404815673828), (499999, 0.0), (499999, 0

 17%|█▋        | 124/725 [3:38:35<20:31:32, 122.95s/it]

test：0.0, test mean: 0.27906976744186046
([8, 7, 8], [8, 7, 8])
tensor(5.3960, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21019040048122406), (499999, 0.0)]
([8, 7, 8], [8, 7, 8])
tensor(5.3814, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.091961860656738), (499999, 0.0)]
([8, 7, 8], [8, 7, 8])
tensor(3.8251, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.901113986968994), (499999, 0.0)]
([8, 0, 8], [8, 7, 8])
tensor(2.4691, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.817356109619141), (499999, 0.0)]
([8, 0, 8], [8, 7, 8])
tensor(2.4437, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.037919521331787), (499999, 0.0)]
([8, 0, 8], [8, 7, 8])
tensor(2.4377, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.179906845092773), (499999, 0.0)]
([8, 0, 8], [8, 7, 8])
tensor(2.4344, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0

 17%|█▋        | 125/725 [3:43:08<28:00:08, 168.01s/it]

test：0.0, test mean: 0.2727272727272727
([2, 7, 1], [2, 7, 1])
tensor(10.5732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.35891613364219666), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(10.9721, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.6801073551177979), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(9.2742, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.871626853942871), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(7.5507, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 5.9637603759765625), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(7.5028, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.145809173583984), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(7.4771, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.256089210510254), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(7.4669, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0

 17%|█▋        | 126/725 [3:47:43<33:19:37, 200.30s/it]

test：0.0, test mean: 0.26666666666666666
([0, 2, 5], [0, 2, 8])
tensor(6.4872, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.001854530069977045), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [0, 2, 8])
tensor(6.5473, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 1.3818552494049072), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [0, 2, 8])
tensor(6.4872, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 1.3533494472503662), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [0, 2, 8])
tensor(6.5362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 2.776010513305664), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [0, 2, 8])
tensor(6.4823, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 2.8264312744140625), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [0, 2, 8])
tensor(6.4820, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 2.8225531578063965), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [0, 2, 8])
tensor(6.4815, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 

 18%|█▊        | 127/725 [3:52:04<36:17:34, 218.49s/it]

test：1.0, test mean: 0.2826086956521739
([2, 1, 0], [2, 1, 0])
tensor(7.2976, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 18%|█▊        | 128/725 [3:52:28<26:32:09, 160.02s/it]

([2, 1, 2], [2, 1, 2])
tensor(9.0004, device='cuda:0', grad_fn=<NllLossBackward0>)


Process Process-8:
Process Process-4:
Process ForkPoolWorker-1644:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 261, in _bootstrap
    util._exit_function()
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 261, in _bootstrap
    util._exit_function()
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/util.py", line 296, in _exit_function
    _run_finalizers(0)
  File "/usr/lib/python3.6/multiprocessing/util.py", line 262, in _run_finalizers
    finalizer()
  File "/usr/lib/python3.6/multiprocessing/util.py", line 295, in _exit_function
    debug('running all "atexit" finalizers with priority >= 0')
  File "/usr/lib/python3.6/multiprocessing/util.py", line 48, in debug
    def debug(msg, *args):
KeyboardInterrupt
  File "/usr/lib/python3.6/multiprocessing/util.py", line 188, in __call__
    self._kwargs = self._key = None
KeyboardInterrupt
Process Process-13:
Trac

RuntimeError: DataLoader worker (pid 8013) exited unexpectedly with exit code 1. Details are lost due to multiprocessing. Rerunning with num_workers=0 may give better error trace.

In [14]:
if __name__=='__main__':
    model.eval()
    acc = []
    preds = []
    labels = []
    for step, (batch_data,batch_label) in enumerate(tqdm(validloader)):
        batch_data = batch_data.cuda() if CUDA else batch_data
        batch_label = batch_label.cuda() if CUDA else batch_label
        batch_label = batch_label.squeeze() - 1
        fake_label = torch.zeros_like(batch_label)
        pred,temp,check = model(batch_data,ce_loss,fake_label,batch_label)
        if sum(check)/len(check) == LEAVE_BIT_NUMBER-1: continue
            
        pred = np.argmax(pred,1)
        batch_label = batch_label.cpu().data.numpy()

        preds.extend(pred.tolist())
        labels.extend(batch_label.tolist())

        count = total = 0
        for c,ans in zip(check,batch_label == pred):
            if c != LEAVE_BIT_NUMBER-1:
                count += ans
                total += 1
        acc.append(count/total)
        print(f"test：{count/total}, test mean: {np.mean(acc)}")

  0%|          | 0/725 [00:00<?, ?it/s]

([0, 0, 0], [0, 0, 0])
tensor(0.3624, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0005131479119881988), (499999, 0.0), (499999, 0.0)]


  0%|          | 1/725 [00:25<5:10:30, 25.73s/it]

([2, 8, 0], [2, 8, 0])
tensor(6.7551, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  0%|          | 2/725 [00:51<5:07:17, 25.50s/it]

([5, 0, 2], [8, 0, 2])
tensor(6.4033, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  0%|          | 3/725 [01:19<5:24:34, 26.97s/it]

([2, 5, 7], [2, 5, 7])
tensor(7.1229, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.21020284295082092)]
([2, 5, 7], [2, 5, 7])
tensor(7.1188, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -10.451231956481934)]
([2, 5, 7], [2, 5, 7])
tensor(5.6265, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.3716301918029785)]
([2, 5, 0], [2, 5, 7])
tensor(4.1749, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.014133453369141)]
([2, 5, 0], [2, 5, 7])
tensor(4.0646, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.305477619171143)]
([2, 5, 0], [2, 5, 7])
tensor(4.0450, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.328861713409424)]
([2, 5, 0], [2, 5, 7])
tensor(4.0367, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.3579969

  1%|          | 4/725 [06:05<25:50:32, 129.03s/it]

test：0.0, test mean: 0.0
([3, 2, 2], [3, 2, 2])
tensor(9.8142, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  1%|          | 5/725 [06:29<18:14:59, 91.25s/it] 

([8, 1, 1], [8, 1, 1])
tensor(6.8578, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.028053686022758484), (499999, 0.020961564034223557), (499999, 0.0)]


  1%|          | 6/725 [06:55<13:48:36, 69.15s/it]

([2, 2, 5], [2, 2, 1])
tensor(11.4518, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  1%|          | 7/725 [07:23<11:05:20, 55.60s/it]

([5, 0, 5], [5, 0, 5])
tensor(2.1855, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  1%|          | 8/725 [07:50<9:13:50, 46.35s/it] 

([5, 7, 0], [5, 7, 0])
tensor(4.3369, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21020665764808655), (499999, 0.0)]
([5, 7, 0], [5, 7, 0])
tensor(4.2267, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.073371887207031), (499999, 0.0)]
([5, 7, 0], [5, 7, 0])
tensor(2.6429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.781552314758301), (499999, 0.0)]
([5, 0, 0], [5, 7, 0])
tensor(1.1216, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.212571144104004), (499999, 0.0)]
([5, 0, 0], [5, 7, 0])
tensor(1.0550, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.014043807983398), (499999, 0.0)]
([5, 0, 0], [5, 7, 0])
tensor(1.0507, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.014980316162109), (499999, 0.0)]
([5, 0, 0], [5, 7, 0])
tensor(1.0487, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.01424503326416), (499999

  1%|          | 9/725 [12:13<22:41:15, 114.07s/it]

test：0.0, test mean: 0.0
([7, 0, 1], [7, 0, 1])
tensor(4.2242, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 0.2082810252904892), (499999, 0.0), (499999, 0.0)]
([7, 0, 1], [7, 0, 1])
tensor(4.3099, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -5.286792278289795), (499999, 0.0), (499999, 0.0)]
([7, 0, 1], [7, 0, 1])
tensor(3.0155, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -3.9929261207580566), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(2.5707, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -3.9134905338287354), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(2.5699, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -3.8972530364990234), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(2.5696, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -3.8872714042663574), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(2.5694, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -3.87555336952

  1%|▏         | 10/725 [16:36<31:47:24, 160.06s/it]

test：0.0, test mean: 0.0
([8, 0, 8], [8, 0, 8])
tensor(3.6185, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 11/725 [17:01<23:36:11, 119.01s/it]

([7, 5, 1], [7, 5, 1])
tensor(6.7009, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21017253398895264), (499999, 0.0), (499999, 0.0020400891080498695)]
([7, 5, 1], [7, 5, 1])
tensor(6.7706, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -8.294473648071289), (499999, 0.0), (499999, 0.004080178216099739)]
([7, 5, 1], [7, 5, 1])
tensor(5.2437, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.919335126876831), (499999, 0.0), (499999, 0.006120266392827034)]
([0, 5, 1], [7, 5, 1])
tensor(3.6794, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.1363377571105957), (499999, 0.0), (499999, 0.008160356432199478)]
([0, 5, 1], [7, 5, 1])
tensor(3.6269, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.7873668670654297), (499999, 0.0), (499999, 0.010200446471571922)]
([0, 5, 1], [7, 5, 1])
tensor(3.6163, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.5957481861114502), (499999, 0.0), (499999, 0.012240531854331493)]
([0, 5, 1], [7, 5, 1])
tensor

  2%|▏         | 12/725 [21:20<32:00:02, 161.57s/it]

test：0.0, test mean: 0.0
([0, 1, 2], [1, 1, 2])
tensor(4.9293, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.014858548529446125), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [1, 1, 2])
tensor(5.1290, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 3.323582410812378), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [1, 1, 2])
tensor(4.9012, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 3.688295841217041), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [1, 1, 2])
tensor(4.9928, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 6.687409400939941), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [1, 1, 2])
tensor(4.8849, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 6.737319469451904), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [1, 1, 2])
tensor(4.8847, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 6.781365394592285), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [1, 1, 2])
tensor(4.8846, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 6.844729423522949), (

  2%|▏         | 13/725 [25:45<38:06:27, 192.68s/it]

test：0.0, test mean: 0.0
([1, 2, 2], [1, 2, 2])
tensor(8.5019, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 14/725 [26:08<27:57:38, 141.57s/it]

([7, 2, 1], [7, 2, 1])
tensor(6.9937, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.34788766503334045), (499999, 0.0), (499999, 0.0)]
([7, 2, 1], [7, 2, 1])
tensor(7.3943, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.059464454650879), (499999, 0.0), (499999, 0.0)]
([7, 2, 1], [7, 2, 1])
tensor(5.8504, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 3.6162562370300293), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(4.0734, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.6931517124176025), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(4.0563, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.452597141265869), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(4.0543, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.292278289794922), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(4.0525, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.21600079536438), (499999, 0.0), (499999, 

  2%|▏         | 15/725 [30:41<35:44:22, 181.22s/it]

test：0.0, test mean: 0.0
([2, 2, 8], [2, 2, 8])
tensor(10.9203, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 16/725 [31:07<26:28:38, 134.44s/it]

([1, 2, 1], [1, 2, 1])
tensor(8.8514, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 17/725 [31:36<20:10:36, 102.59s/it]

([0, 0, 1], [0, 0, 1])
tensor(1.2355, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 18/725 [31:59<15:28:10, 78.77s/it] 

([0, 3, 8], [0, 3, 8])
tensor(1.6809, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -0.7021147012710571), (499999, 0.0)]
([0, 3, 8], [0, 3, 8])
tensor(1.7054, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -2.719336748123169), (499999, 0.0)]
([0, 3, 8], [0, 3, 8])
tensor(1.6862, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -4.73132848739624), (499999, 0.0)]
([0, 3, 8], [0, 3, 8])
tensor(1.5570, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -5.057755470275879), (499999, 0.0)]
([0, 0, 8], [0, 3, 8])
tensor(0.9032, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -6.123366355895996), (499999, 0.0)]
([0, 0, 8], [0, 3, 8])
tensor(0.6587, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -4.576568603515625), (499999, 0.0)]
([0, 0, 8], [0, 3, 8])
tensor(0.5823, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -4.533809661865234), (499999

  3%|▎         | 19/725 [36:29<26:43:45, 136.30s/it]

test：0.0, test mean: 0.0
([1, 0, 1], [1, 0, 1])
tensor(4.5884, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.00981955323368311), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(4.7081, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.58347749710083), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(4.5775, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.827105760574341), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(4.6232, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.1273193359375), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(4.5584, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.152588367462158), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(4.5582, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.182387113571167), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(4.5580, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.21

  3%|▎         | 20/725 [40:52<34:07:59, 174.30s/it]

test：1.0, test mean: 0.125
([1, 2, 5], [1, 2, 5])
tensor(7.1902, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  3%|▎         | 21/725 [41:15<25:13:32, 128.99s/it]

([5, 5, 5], [5, 0, 5])
tensor(2.7881, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0030312323942780495), (499999, 0.0), (499999, 0.0)]


  3%|▎         | 22/725 [41:40<19:02:31, 97.51s/it] 

([5, 1, 1], [5, 1, 1])
tensor(5.4357, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.009557705372571945)]


  3%|▎         | 23/725 [42:03<14:42:27, 75.42s/it]

([0, 1, 8], [0, 1, 8])
tensor(2.5596, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  3%|▎         | 24/725 [42:27<11:38:54, 59.82s/it]

([5, 7, 2], [1, 0, 2])
tensor(6.2757, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  3%|▎         | 25/725 [42:51<9:32:09, 49.04s/it] 

([1, 2, 2], [1, 2, 2])
tensor(9.6130, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  4%|▎         | 26/725 [43:15<8:06:02, 41.72s/it]

([1, 2, 5], [1, 2, 5])
tensor(6.5659, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  4%|▎         | 27/725 [43:40<7:03:57, 36.44s/it]

([5, 2, 2], [5, 2, 2])
tensor(9.5054, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -0.2663199305534363), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [5, 2, 2])
tensor(9.1106, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 1.1911048889160156), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [5, 2, 2])
tensor(8.7988, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 1.2543374300003052), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [5, 2, 2])
tensor(8.7985, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 1.3128752708435059), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [5, 2, 2])
tensor(8.7983, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 1.368053674697876), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [5, 2, 2])
tensor(8.7981, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 1.419219970703125), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [5, 2, 2])
tensor(8.7979, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 1.4673609733581543), (499999, 0.0), (499999,

  4%|▍         | 28/725 [47:57<19:54:08, 102.80s/it]

test：0.0, test mean: 0.1111111111111111
([7, 1, 7], [7, 1, 7])
tensor(8.3696, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 0.20880326628684998), (499999, 0.0), (380927, 0.21015596389770508)]
([7, 1, 7], [7, 1, 7])
tensor(8.1178, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -4.031685829162598), (499999, 0.0), (380927, -10.860811233520508)]
([7, 1, 7], [7, 1, 7])
tensor(5.1930, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.0570220947265625), (499999, 0.0), (380927, -7.548801898956299)]
([0, 1, 0], [7, 1, 7])
tensor(3.1552, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.0644052028656006), (499999, 0.0), (380927, -7.106846332550049)]
([0, 1, 0], [7, 1, 7])
tensor(3.0927, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.1004374027252197), (499999, 0.0), (380927, -7.759419918060303)]
([0, 1, 0], [7, 1, 7])
tensor(3.0716, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.1166751384735107), (499999, 0.0), (380927, -8.140527725219727)]
([0

  4%|▍         | 29/725 [52:27<29:34:10, 152.95s/it]

test：0.0, test mean: 0.1
([5, 0, 1], [5, 0, 1])
tensor(2.2989, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.000982522265985608), (499999, 0.0), (499999, 0.0)]


  4%|▍         | 30/725 [52:51<22:04:45, 114.37s/it]

([7, 7, 1], [7, 7, 1])
tensor(6.9547, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 0.20731960237026215), (368639, 0.20674538612365723), (499999, 0.0)]
([7, 7, 1], [7, 7, 1])
tensor(6.7262, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -12.56232738494873), (368639, -11.350728988647461), (499999, 0.0)]
([7, 7, 1], [7, 7, 1])
tensor(3.7670, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -9.98906135559082), (368639, -6.309220314025879), (499999, 0.0)]
([0, 0, 1], [7, 7, 1])
tensor(2.6377, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -10.100078582763672), (368639, -6.369980812072754), (499999, 0.0)]
([0, 0, 1], [7, 7, 1])
tensor(2.6355, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -10.19202995300293), (368639, -6.4172539710998535), (499999, 0.0)]
([0, 0, 1], [7, 7, 1])
tensor(2.6335, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -10.277069091796875), (368639, -6.455286979675293), (499999, 0.0)]
([0, 0, 1], [7, 7, 1])
tensor(2.6326, dev

  4%|▍         | 31/725 [57:08<30:14:59, 156.92s/it]

test：0.0, test mean: 0.09090909090909091
([1, 7, 7], [1, 7, 7])
tensor(6.5809, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.21022117137908936)]
([1, 7, 7], [1, 7, 7])
tensor(6.4907, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -9.797621726989746)]
([1, 7, 7], [1, 7, 7])
tensor(4.9325, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.0846967697143555)]
([1, 7, 7], [1, 7, 7])
tensor(3.3332, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.6944159269332886)]
([1, 7, 0], [1, 7, 7])
tensor(2.8914, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.7777328491210938)]
([1, 7, 0], [1, 7, 7])
tensor(2.8187, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.6564251184463501)]
([1, 7, 0], [1, 7, 7])
tensor(2.7934, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999

  4%|▍         | 32/725 [1:01:47<37:16:10, 193.61s/it]

test：0.0, test mean: 0.08333333333333333
([2, 2, 8], [2, 2, 5])
tensor(9.9729, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.00099668325856328)]


  5%|▍         | 33/725 [1:02:14<27:36:38, 143.64s/it]

([2, 1, 1], [2, 1, 1])
tensor(9.4959, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  5%|▍         | 34/725 [1:02:38<20:40:30, 107.71s/it]

([2, 2, 2], [2, 2, 2])
tensor(11.8428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  5%|▍         | 35/725 [1:03:02<15:50:16, 82.63s/it] 

([7, 5, 1], [7, 6, 1])
tensor(2.2925, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  5%|▍         | 36/725 [1:03:27<12:31:17, 65.42s/it]

([3, 2, 0], [3, 2, 0])
tensor(7.5462, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 0.011441510170698166), (499999, 0.0), (499999, 0.0)]
([3, 2, 0], [3, 2, 0])
tensor(7.2063, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 3.831432580947876), (499999, 0.0), (499999, 0.0)]
([3, 2, 0], [3, 2, 0])
tensor(7.2585, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 9.312765121459961), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [3, 2, 0])
tensor(6.4713, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 9.715995788574219), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [3, 2, 0])
tensor(6.4563, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 9.722175598144531), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [3, 2, 0])
tensor(6.4485, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 9.851738929748535), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [3, 2, 0])
tensor(6.4456, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 10.010527610778809), (499999, 0.0), (499999, 0

  5%|▌         | 37/725 [1:07:48<23:41:54, 124.00s/it]

test：0.0, test mean: 0.07692307692307693
([2, 2, 8], [2, 2, 8])
tensor(10.6700, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  5%|▌         | 38/725 [1:08:12<17:58:12, 94.17s/it] 

([8, 3, 1], [8, 3, 1])
tensor(6.3060, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0006637040060013533), (499999, 0.0), (499999, 0.0)]


  5%|▌         | 39/725 [1:08:38<14:01:27, 73.60s/it]

([2, 5, 7], [2, 3, 7])
tensor(8.2267, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.3479320704936981)]
([2, 5, 7], [2, 3, 7])
tensor(8.5647, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -1.6599409580230713)]
([2, 5, 7], [2, 3, 7])
tensor(6.6538, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 1.555189847946167)]
([2, 5, 7], [2, 3, 7])
tensor(4.7385, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -5.942605018615723)]
([2, 5, 0], [2, 3, 7])
tensor(4.5667, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.345849990844727)]
([2, 5, 0], [2, 3, 7])
tensor(4.5147, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.82626485824585)]
([2, 5, 0], [2, 3, 7])
tensor(4.5070, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -5.1833639144

  6%|▌         | 40/725 [1:13:18<25:46:41, 135.48s/it]

test：0.0, test mean: 0.07142857142857142
([3, 0, 2], [3, 0, 2])
tensor(5.1912, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -0.791638970375061), (499999, 0.0), (499999, 0.0)]
([3, 0, 2], [3, 0, 2])
tensor(5.1260, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -1.996485710144043), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(4.1835, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -0.9140093326568604), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(4.1011, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -0.8224098086357117), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(4.0693, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -0.21830976009368896), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(4.0570, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, 0.5388097763061523), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(4.0250, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471

  6%|▌         | 41/725 [1:17:37<32:48:39, 172.69s/it]

test：0.0, test mean: 0.06666666666666667
([7, 2, 2], [7, 2, 2])
tensor(12.1434, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21018220484256744), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(12.2603, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.181716918945312), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(10.7199, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.015775680541992), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(9.3195, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.916984558105469), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(9.2887, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.15355110168457), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(9.2812, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.247612953186035), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(9.2765, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927,

  6%|▌         | 42/725 [1:21:59<37:48:24, 199.28s/it]

test：0.0, test mean: 0.0625
([8, 6, 2], [0, 6, 2])
tensor(5.0147, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  6%|▌         | 43/725 [1:22:23<27:49:25, 146.87s/it]

([2, 2, 1], [2, 2, 1])
tensor(8.4863, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  6%|▌         | 44/725 [1:22:47<20:47:49, 109.94s/it]

([2, 1, 3], [2, 1, 3])
tensor(6.8105, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, 0.07848465442657471)]
([2, 1, 3], [2, 1, 3])
tensor(6.5310, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, 5.389955043792725)]
([2, 1, 0], [2, 1, 3])
tensor(6.4168, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, 13.063579559326172)]
([2, 1, 0], [2, 1, 3])
tensor(6.1015, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, 12.80688762664795)]
([2, 1, 0], [2, 1, 3])
tensor(6.0979, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, 12.608464241027832)]
([2, 1, 0], [2, 1, 3])
tensor(6.0960, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, 12.451057434082031)]
([2, 1, 0], [2, 1, 3])
tensor(6.0953, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, 12.32031726837

  6%|▌         | 45/725 [1:28:03<32:27:49, 171.87s/it]

test：0.0, test mean: 0.058823529411764705
([2, 1, 2], [2, 1, 2])
tensor(9.4003, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  6%|▋         | 46/725 [1:28:35<24:28:48, 129.79s/it]

([2, 1, 8], [2, 1, 5])
tensor(7.0549, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0013595614582300186), (499999, 0.0), (499999, 0.0)]


  6%|▋         | 47/725 [1:29:06<18:53:08, 100.28s/it]

([5, 1, 2], [1, 1, 2])
tensor(6.2068, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.001881428062915802), (499999, 0.0)]


  7%|▋         | 48/725 [1:29:36<14:51:18, 78.99s/it] 

([2, 3, 1], [2, 3, 1])
tensor(6.2826, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 49/725 [1:30:03<11:56:17, 63.58s/it]

([2, 1, 2], [2, 1, 2])
tensor(11.6193, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0004824787611141801), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 50/725 [1:30:33<10:00:47, 53.40s/it]

([8, 5, 1], [8, 5, 1])
tensor(6.0231, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 51/725 [1:31:02<8:39:04, 46.21s/it] 

([2, 7, 1], [2, 0, 1])
tensor(6.7212, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.004680681973695755), (499999, 0.0)]


  7%|▋         | 52/725 [1:31:32<7:40:53, 41.09s/it]

([2, 5, 5], [2, 0, 5])
tensor(5.8087, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 53/725 [1:31:59<6:53:25, 36.91s/it]

([2, 2, 2], [2, 2, 2])
tensor(10.6269, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 54/725 [1:32:28<6:26:20, 34.55s/it]

([1, 2, 8], [1, 2, 5])
tensor(8.0159, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 55/725 [1:32:54<5:56:46, 31.95s/it]

([5, 1, 2], [1, 1, 2])
tensor(8.0012, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 56/725 [1:33:22<5:42:50, 30.75s/it]

([5, 7, 3], [5, 7, 5])
tensor(5.9419, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21019183099269867), (499999, 0.0)]
([5, 7, 3], [5, 7, 5])
tensor(5.9346, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.092045783996582), (499999, 0.0)]
([5, 7, 3], [5, 7, 5])
tensor(4.3683, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.033146381378174), (499999, 0.0)]
([5, 0, 3], [5, 7, 5])
tensor(3.0729, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.88291597366333), (499999, 0.0)]
([5, 0, 3], [5, 7, 5])
tensor(3.0038, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.232389450073242), (499999, 0.0)]
([5, 0, 3], [5, 7, 5])
tensor(2.9837, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.4239397048950195), (499999, 0.0)]
([5, 0, 3], [5, 7, 5])
tensor(2.9770, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.617565155029297), (49999

  8%|▊         | 57/725 [1:38:39<21:40:18, 116.79s/it]

test：0.0, test mean: 0.05555555555555555
([5, 7, 1], [5, 7, 1])
tensor(4.1718, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 58/725 [1:39:10<16:50:12, 90.87s/it] 

([1, 0, 0], [1, 0, 0])
tensor(3.1172, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 59/725 [1:39:41<13:30:22, 73.01s/it]

([1, 0, 8], [1, 0, 8])
tensor(3.1147, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 60/725 [1:40:10<11:04:18, 59.94s/it]

([7, 0, 8], [7, 0, 8])
tensor(0.7450, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -0.005944561213254929), (499999, 0.0)]
([7, 0, 8], [7, 0, 8])
tensor(0.7549, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -0.35849714279174805), (499999, 0.0)]
([7, 0, 8], [7, 0, 8])
tensor(0.7414, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -0.4294719398021698), (499999, 0.0)]
([7, 0, 8], [7, 0, 8])
tensor(0.7420, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -0.6199008226394653), (499999, 0.0)]
([7, 0, 8], [7, 0, 8])
tensor(0.7334, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -0.6525689959526062), (499999, 0.0)]
([7, 0, 8], [7, 0, 8])
tensor(0.7332, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -0.7066066265106201), (499999, 0.0)]
([7, 0, 8], [7, 0, 8])
tensor(0.7318, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -0.7302587032318115

  8%|▊         | 61/725 [1:45:08<24:13:24, 131.33s/it]

test：1.0, test mean: 0.10526315789473684
([1, 2, 0], [1, 2, 0])
tensor(6.1965, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.01087838876992464), (499999, 0.0), (499999, 0.0)]


  9%|▊         | 62/725 [1:45:36<18:29:14, 100.38s/it]

([5, 3, 3], [5, 3, 3])
tensor(2.8689, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, -0.6085019111633301), (196607, -0.1204059049487114)]
([5, 3, 3], [5, 3, 3])
tensor(2.4651, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, 0.150116428732872), (196607, 4.942143440246582)]
([5, 3, 8], [5, 3, 3])
tensor(2.4329, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, 0.6619023084640503), (196607, 7.97843599319458)]
([5, 3, 0], [5, 3, 3])
tensor(1.8946, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, 0.9640666246414185), (196607, 8.357267379760742)]
([5, 3, 0], [5, 3, 3])
tensor(1.8576, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, 2.2323126792907715), (196607, 8.655778884887695)]
([5, 3, 0], [5, 3, 3])
tensor(1.8077, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, 1.6576766967773438), (196607, 8.820871353149414)]
([5, 0, 0], [5, 3, 3])
tensor(1.0065, device='cuda:

  9%|▊         | 63/725 [1:51:31<32:27:27, 176.51s/it]

test：0.0, test mean: 0.1
([0, 3, 1], [0, 3, 1])
tensor(2.6690, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 64/725 [1:52:00<24:19:01, 132.44s/it]

([8, 6, 2], [8, 6, 2])
tensor(8.8224, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 65/725 [1:52:30<18:39:33, 101.78s/it]

([1, 1, 0], [1, 1, 0])
tensor(3.4966, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.00018687802366912365), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 66/725 [1:53:00<14:40:26, 80.16s/it] 

([7, 1, 1], [7, 1, 1])
tensor(6.7991, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.014288829639554024)]


  9%|▉         | 67/725 [1:53:29<11:50:29, 64.79s/it]

([1, 3, 0], [1, 3, 0])
tensor(2.6118, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -0.6965276002883911), (499999, 0.0)]
([1, 3, 0], [1, 3, 0])
tensor(2.5941, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -3.4951727390289307), (499999, 0.0)]
([1, 0, 0], [1, 3, 0])
tensor(1.8284, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -1.9271312952041626), (499999, 0.0)]
([1, 0, 0], [1, 3, 0])
tensor(1.4920, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 0.14495986700057983), (499999, 0.0)]
([1, 0, 0], [1, 3, 0])
tensor(1.4031, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 0.24916160106658936), (499999, 0.0)]
([1, 0, 0], [1, 3, 0])
tensor(1.4013, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 0.33266758918762207), (499999, 0.0)]
([1, 0, 0], [1, 3, 0])
tensor(1.4007, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 0.4033138155937195), (

  9%|▉         | 68/725 [1:59:17<27:20:25, 149.81s/it]

test：0.0, test mean: 0.09523809523809523
([8, 2, 1], [8, 2, 1])
tensor(7.4264, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.009762611240148544), (499999, 0.0)]


 10%|▉         | 69/725 [1:59:44<20:35:40, 113.02s/it]

([2, 5, 2], [2, 6, 2])
tensor(10.5850, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|▉         | 70/725 [2:00:10<15:47:19, 86.78s/it] 

([5, 5, 8], [5, 5, 8])
tensor(3.2956, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -0.2522118389606476), (499999, 0.0), (499999, 0.0)]
([0, 5, 8], [5, 5, 8])
tensor(2.9522, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.1988445520401001), (499999, 0.0), (499999, 0.0)]
([0, 5, 8], [5, 5, 8])
tensor(2.8249, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.21435627341270447), (499999, 0.0), (499999, 0.0)]
([0, 5, 8], [5, 5, 8])
tensor(2.8248, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.2274034321308136), (499999, 0.0), (499999, 0.0)]
([0, 5, 8], [5, 5, 8])
tensor(2.8248, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.24075102806091309), (499999, 0.0), (499999, 0.0)]
([0, 5, 8], [5, 5, 8])
tensor(2.8248, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.25388699769973755), (499999, 0.0), (499999, 0.0)]
([0, 5, 8], [5, 5, 8])
tensor(2.8248, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.2670232653617859), (499999, 0.0), (49

 10%|▉         | 71/725 [2:04:47<26:06:45, 143.74s/it]

test：0.0, test mean: 0.09090909090909091
([5, 6, 1], [5, 4, 1])
tensor(3.6473, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|▉         | 72/725 [2:05:15<19:46:24, 109.01s/it]

([1, 1, 6], [1, 1, 6])
tensor(6.8733, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|█         | 73/725 [2:05:40<15:11:19, 83.86s/it] 

([2, 2, 3], [2, 2, 3])
tensor(11.8221, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|█         | 74/725 [2:06:05<12:00:02, 66.36s/it]

([5, 5, 7], [7, 5, 7])
tensor(1.3203, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -0.4073481857776642), (499999, 0.0)]
([5, 1, 7], [7, 5, 7])
tensor(1.1870, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 9.408577919006348), (499999, 0.0)]
([5, 0, 7], [7, 5, 7])
tensor(0.8643, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 17.846052169799805), (499999, 0.0)]
([5, 0, 7], [7, 5, 7])
tensor(0.6899, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 17.923667907714844), (499999, 0.0)]
([5, 0, 7], [7, 5, 7])
tensor(0.6883, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 17.913772583007812), (499999, 0.0)]
([5, 0, 7], [7, 5, 7])
tensor(0.6873, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 17.96221923828125), (499999, 0.0)]
([5, 0, 7], [7, 5, 7])
tensor(0.6868, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 17.957429885864258), (499999,

 10%|█         | 75/725 [2:10:41<23:19:09, 129.15s/it]

test：0.0, test mean: 0.08695652173913043
([6, 0, 1], [6, 0, 1])
tensor(3.5692, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.0011882721446454525), (499999, 0.0)]
([6, 0, 1], [6, 0, 1])
tensor(3.7170, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 4.58638858795166), (499999, 0.0)]
([6, 0, 1], [6, 0, 1])
tensor(3.5556, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 5.00778865814209), (499999, 0.0)]
([6, 0, 1], [6, 0, 1])
tensor(3.5782, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 5.346301555633545), (499999, 0.0)]
([6, 0, 1], [6, 0, 1])
tensor(3.5371, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 5.375274658203125), (499999, 0.0)]
([6, 0, 1], [6, 0, 1])
tensor(3.5368, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 5.399592876434326), (499999, 0.0)]
([6, 0, 1], [6, 0, 1])
tensor(3.5367, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0),

 10%|█         | 76/725 [2:15:57<33:22:08, 185.10s/it]

test：1.0, test mean: 0.125
([7, 0, 0], [7, 0, 7])
tensor(3.4705, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.2101166546344757), (499999, 0.0), (499999, 0.0)]
([7, 0, 0], [7, 0, 7])
tensor(3.2831, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.347166538238525), (499999, 0.0), (499999, 0.0)]
([7, 0, 0], [7, 0, 7])
tensor(1.8533, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.8528374433517456), (499999, 0.0), (499999, 0.0)]
([0, 0, 0], [7, 0, 7])
tensor(0.3982, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.489010810852051), (499999, 0.0), (499999, 0.0)]
([0, 0, 0], [7, 0, 7])
tensor(0.3538, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.8513617515563965), (499999, 0.0), (499999, 0.0)]
([0, 0, 0], [7, 0, 7])
tensor(0.3416, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.0639631748199463), (499999, 0.0), (499999, 0.0)]
([0, 0, 0], [7, 0, 7])
tensor(0.3360, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.2464337348

 11%|█         | 77/725 [2:21:15<40:29:14, 224.93s/it]

test：0.0, test mean: 0.12
([0, 2, 2], [0, 2, 2])
tensor(10.2291, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 0.006206104066222906), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(10.3720, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -4.601059913635254), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(10.2249, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -4.69771671295166), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(10.3981, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -9.535026550292969), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(10.2236, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -9.626311302185059), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(10.3817, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -13.960773468017578), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(10.2217, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -14.0436

 11%|█         | 78/725 [2:26:31<45:22:50, 252.50s/it]

test：1.0, test mean: 0.15384615384615385
([1, 6, 0], [1, 6, 0])
tensor(3.1438, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -0.05956359952688217)]
([1, 6, 0], [1, 6, 0])
tensor(3.0989, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -1.8581267595291138)]
([1, 6, 0], [1, 6, 0])
tensor(3.0676, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -1.7984545230865479)]
([1, 6, 0], [1, 6, 0])
tensor(3.0523, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -1.7864207029342651)]
([1, 6, 0], [1, 6, 0])
tensor(3.0422, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -1.86820650100708)]
([1, 6, 0], [1, 6, 0])
tensor(3.0401, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -1.8917033672332764)]
([1, 6, 0], [1, 6, 0])
tensor(3.0381, device='cuda:0', grad_fn=<NllLossBackward0>)
[(49999

 11%|█         | 79/725 [2:31:54<49:03:42, 273.41s/it]

test：1.0, test mean: 0.18518518518518517
([7, 1, 1], [7, 1, 1])
tensor(7.6863, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21017828583717346), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [7, 1, 1])
tensor(7.7916, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.09185791015625), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [7, 1, 1])
tensor(6.2173, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.903558731079102), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(4.8693, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.814618110656738), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(4.8523, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.028872489929199), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(4.8449, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.16883659362793), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(4.8413, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.

 11%|█         | 80/725 [2:37:02<50:52:18, 283.94s/it]

test：0.0, test mean: 0.17857142857142858
([6, 0, 6], [6, 0, 6])
tensor(3.0671, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 11%|█         | 81/725 [2:37:34<37:16:50, 208.40s/it]

([0, 6, 0], [0, 6, 0])
tensor(1.4953, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 11%|█▏        | 82/725 [2:38:04<27:39:18, 154.83s/it]

([1, 1, 2], [1, 1, 2])
tensor(9.7693, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 11%|█▏        | 83/725 [2:38:35<20:58:35, 117.63s/it]

([0, 2, 1], [1, 2, 1])
tensor(6.3726, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, 0.02290264517068863), (499999, 0.0), (499999, 0.0)]
([7, 2, 1], [1, 2, 1])
tensor(6.5651, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, -0.19841206073760986), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [1, 2, 1])
tensor(6.2310, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, 0.17079931497573853), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [1, 2, 1])
tensor(6.2146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, 0.32577815651893616), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [1, 2, 1])
tensor(6.2116, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, 0.44422101974487305), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [1, 2, 1])
tensor(6.2093, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, 0.5369817614555359), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [1, 2, 1])
tensor(6.2087, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, 0.6182448863983154), (499999, 0.0), (

 12%|█▏        | 84/725 [2:44:05<32:19:21, 181.53s/it]

test：0.0, test mean: 0.1724137931034483
([5, 1, 1], [8, 1, 1])
tensor(4.8640, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 12%|█▏        | 85/725 [2:44:38<24:18:36, 136.74s/it]

([7, 5, 5], [7, 8, 8])
tensor(5.1712, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21022023260593414), (499999, 0.0), (499999, 0.0)]
([7, 5, 5], [7, 8, 8])
tensor(4.9681, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.797564506530762), (499999, 0.0), (499999, 0.0)]
([7, 5, 5], [7, 8, 8])
tensor(3.3073, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.088959217071533), (499999, 0.0), (499999, 0.0)]
([7, 5, 5], [7, 8, 8])
tensor(1.7426, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.23986327648162842), (499999, 0.0), (499999, 0.0)]
([0, 5, 5], [7, 8, 8])
tensor(1.3898, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.1616644859313965), (499999, 0.0), (499999, 0.0)]
([0, 5, 5], [7, 8, 8])
tensor(1.3361, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.343677043914795), (499999, 0.0), (499999, 0.0)]
([0, 5, 5], [7, 8, 8])
tensor(1.3165, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.2773511409759521), (499999, 0.0), (49

 12%|█▏        | 86/725 [2:50:06<34:28:23, 194.22s/it]

test：0.0, test mean: 0.16666666666666666
([7, 3, 8], [7, 3, 8])
tensor(4.3759, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 0.20935343205928802), (499999, 0.0), (499999, 0.0)]
([7, 3, 8], [7, 3, 8])
tensor(4.4301, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -8.335918426513672), (499999, 0.0), (499999, 0.0)]
([7, 3, 8], [7, 3, 8])
tensor(2.9347, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -6.6300883293151855), (499999, 0.0), (499999, 0.0)]
([0, 3, 8], [7, 3, 8])
tensor(2.4625, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -6.541926860809326), (499999, 0.0), (499999, 0.0)]
([0, 3, 8], [7, 3, 8])
tensor(2.4616, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -6.496508598327637), (499999, 0.0), (499999, 0.0)]
([0, 3, 8], [7, 3, 8])
tensor(2.4613, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -6.474739074707031), (499999, 0.0), (499999, 0.0)]
([0, 3, 8], [7, 3, 8])
tensor(2.4610, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 

 12%|█▏        | 87/725 [2:55:12<40:21:46, 227.75s/it]

test：0.0, test mean: 0.16129032258064516
([0, 0, 5], [0, 0, 5])
tensor(0.9862, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 12%|█▏        | 88/725 [2:55:41<29:43:41, 168.01s/it]

([2, 1, 5], [2, 1, 5])
tensor(6.0881, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 12%|█▏        | 89/725 [2:56:11<22:24:41, 126.86s/it]

([2, 1, 5], [2, 1, 8])
tensor(5.6031, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 12%|█▏        | 90/725 [2:56:42<17:18:09, 98.09s/it] 

([1, 3, 7], [1, 7, 7])
tensor(4.5233, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 13%|█▎        | 91/725 [2:57:13<13:43:07, 77.90s/it]

([1, 1, 2], [1, 1, 2])
tensor(6.1416, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 13%|█▎        | 92/725 [2:57:43<11:09:20, 63.44s/it]

([1, 5, 3], [1, 3, 3])
tensor(4.4722, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0010735820978879929), (499999, 0.00038736092392355204), (499999, -0.00031610875157639384)]


 13%|█▎        | 93/725 [2:58:15<9:29:06, 54.03s/it] 

([2, 3, 1], [2, 3, 1])
tensor(7.9753, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 13%|█▎        | 94/725 [2:58:46<8:15:35, 47.12s/it]

([0, 8, 3], [0, 8, 3])
tensor(3.6201, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 13%|█▎        | 95/725 [2:59:17<7:25:14, 42.40s/it]

([8, 2, 2], [8, 2, 2])
tensor(11.4657, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 13%|█▎        | 96/725 [2:59:50<6:52:38, 39.36s/it]

([2, 1, 1], [2, 1, 1])
tensor(7.9395, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 13%|█▎        | 97/725 [3:00:18<6:18:16, 36.14s/it]

([8, 2, 3], [8, 2, 3])
tensor(7.6218, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.01238060649484396)]


 14%|█▎        | 98/725 [3:00:50<6:03:08, 34.75s/it]

([1, 0, 8], [1, 0, 8])
tensor(2.6566, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (331775, 0.0045595960691571236), (499999, 0.0)]
([1, 0, 8], [1, 0, 8])
tensor(2.8966, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (331775, -1.7732930183410645), (499999, 0.0)]
([1, 0, 8], [1, 0, 8])
tensor(2.6527, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (331775, -1.9372419118881226), (499999, 0.0)]
([1, 0, 8], [1, 0, 8])
tensor(2.8844, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (331775, -3.49515438079834), (499999, 0.0)]
([1, 0, 8], [1, 0, 8])
tensor(2.6382, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (331775, -3.5448992252349854), (499999, 0.0)]
([1, 0, 8], [1, 0, 8])
tensor(2.6428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (331775, -3.585134267807007), (499999, 0.0)]
([1, 0, 8], [1, 0, 8])
tensor(2.6373, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (331775, -3.634916305541992), (4

 14%|█▎        | 99/725 [3:06:14<21:09:17, 121.66s/it]

test：1.0, test mean: 0.1875
([2, 8, 5], [2, 8, 0])
tensor(6.3940, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 14%|█▍        | 100/725 [3:06:53<16:48:48, 96.85s/it]

([1, 2, 0], [1, 2, 0])
tensor(4.6459, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 14%|█▍        | 101/725 [3:07:35<13:55:55, 80.38s/it]

([2, 2, 0], [2, 2, 7])
tensor(8.6313, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.00010359505540691316)]


 14%|█▍        | 102/725 [3:08:13<11:43:18, 67.73s/it]

([1, 1, 1], [1, 1, 1])
tensor(7.5195, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0013250585179775953), (499999, 0.0), (499999, 0.0)]


 14%|█▍        | 103/725 [3:08:52<10:12:50, 59.12s/it]

([1, 1, 1], [1, 1, 1])
tensor(4.8319, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 14%|█▍        | 104/725 [3:09:27<8:56:37, 51.85s/it] 

([8, 1, 3], [7, 1, 3])
tensor(4.6270, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -0.013166490942239761)]
([8, 1, 3], [7, 1, 3])
tensor(4.1109, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 5.33681058883667)]
([8, 1, 3], [7, 1, 3])
tensor(4.0249, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 12.495638847351074)]
([8, 1, 0], [7, 1, 3])
tensor(3.5340, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 12.321321487426758)]
([8, 1, 0], [7, 1, 3])
tensor(3.5310, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 12.191150665283203)]
([8, 1, 0], [7, 1, 3])
tensor(3.5297, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 12.098196029663086)]
([8, 1, 0], [7, 1, 3])
tensor(3.5290, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 12.022300720

 14%|█▍        | 105/725 [3:15:31<25:03:07, 145.46s/it]

test：0.0, test mean: 0.18181818181818182
([7, 5, 2], [7, 5, 2])
tensor(6.6543, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 15%|█▍        | 106/725 [3:16:05<19:14:01, 111.86s/it]

([5, 1, 1], [3, 1, 1])
tensor(5.9510, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 15%|█▍        | 107/725 [3:16:31<14:48:27, 86.26s/it] 

([5, 7, 0], [5, 7, 0])
tensor(4.0626, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21019169688224792), (499999, 0.0)]
([5, 7, 0], [5, 7, 0])
tensor(4.0486, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.092050552368164), (499999, 0.0)]
([5, 7, 0], [5, 7, 0])
tensor(2.4928, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.897960186004639), (499999, 0.0)]
([5, 0, 0], [5, 7, 0])
tensor(1.1201, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.812355995178223), (499999, 0.0)]
([5, 0, 0], [5, 7, 0])
tensor(1.0950, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.030643463134766), (499999, 0.0)]
([5, 0, 0], [5, 7, 0])
tensor(1.0881, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.174217700958252), (499999, 0.0)]
([5, 0, 0], [5, 7, 0])
tensor(1.0844, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.21346378326416), (499999

 15%|█▍        | 108/725 [3:21:41<26:17:38, 153.42s/it]

test：0.0, test mean: 0.17647058823529413
([7, 1, 2], [7, 1, 2])
tensor(5.8047, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 0.20863589644432068), (499999, 0.0), (499999, 0.0)]
([7, 1, 2], [7, 1, 2])
tensor(5.9081, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -12.57136344909668), (499999, 0.0), (499999, 0.0)]
([7, 1, 2], [7, 1, 2])
tensor(4.4286, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -7.61333703994751), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(3.7414, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -7.773785591125488), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(3.7371, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -7.885397911071777), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(3.7356, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -7.983453273773193), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(3.7339, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -8

 15%|█▌        | 109/725 [3:27:59<37:44:53, 220.61s/it]

test：0.0, test mean: 0.17142857142857143
([8, 1, 0], [8, 1, 0])
tensor(2.6759, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 15%|█▌        | 110/725 [3:28:32<28:05:28, 164.44s/it]

([2, 7, 1], [2, 7, 1])
tensor(8.0590, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21011866629123688), (499999, 0.004408417269587517)]
([2, 7, 1], [2, 7, 1])
tensor(7.8954, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -6.3478922843933105), (499999, 0.008816834539175034)]
([2, 7, 1], [2, 7, 1])
tensor(6.4655, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.0938761234283447), (499999, 0.0132252536714077)]
([2, 0, 1], [2, 7, 1])
tensor(4.9929, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.82974112033844), (499999, 0.017633669078350067)]
([2, 0, 1], [2, 7, 1])
tensor(4.9262, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.190276861190796), (499999, 0.022042088210582733)]
([2, 0, 1], [2, 7, 1])
tensor(4.9155, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.419862747192383), (499999, 0.0264505073428154)]
([2, 0, 1], [2, 7, 1])
tensor(4.9125

 15%|█▌        | 111/725 [3:34:40<38:28:22, 225.57s/it]

test：0.0, test mean: 0.16666666666666666
([8, 0, 8], [8, 0, 8])
tensor(3.2675, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.000704711361322552)]


 15%|█▌        | 112/725 [3:35:11<28:25:58, 166.98s/it]

([5, 1, 0], [6, 1, 0])
tensor(2.6359, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 16%|█▌        | 113/725 [3:35:46<21:40:16, 127.48s/it]

([1, 5, 8], [1, 5, 8])
tensor(4.3554, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.008515957742929459), (499999, 0.0)]


 16%|█▌        | 114/725 [3:36:16<16:40:19, 98.23s/it] 

([0, 1, 3], [0, 1, 3])
tensor(2.9562, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 16%|█▌        | 115/725 [3:36:48<13:17:18, 78.42s/it]

([8, 2, 0], [7, 2, 0])
tensor(4.5713, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -0.001056453213095665)]
([8, 2, 0], [7, 2, 0])
tensor(4.5703, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -0.01461155153810978)]
([8, 2, 0], [7, 2, 0])
tensor(4.5701, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -0.024788152426481247)]
([8, 2, 0], [7, 2, 0])
tensor(4.5676, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -0.037203047424554825)]
([8, 2, 0], [7, 2, 0])
tensor(4.5662, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -0.04576968401670456)]
([8, 2, 0], [7, 2, 0])
tensor(4.5659, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -0.05935771018266678)]
([8, 2, 0], [7, 2, 0])
tensor(4.5655, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639

 16%|█▌        | 116/725 [3:42:35<26:55:02, 159.12s/it]

test：1.0, test mean: 0.1891891891891892
([2, 1, 2], [2, 1, 2])
tensor(9.9779, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 16%|█▌        | 117/725 [3:43:10<20:33:48, 121.76s/it]

([2, 6, 8], [2, 6, 8])
tensor(7.8217, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 16%|█▋        | 118/725 [3:43:40<15:54:01, 94.30s/it] 

([1, 1, 2], [1, 1, 2])
tensor(6.3805, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 16%|█▋        | 119/725 [3:44:10<12:37:45, 75.03s/it]

([8, 3, 6], [8, 3, 6])
tensor(3.7952, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 17%|█▋        | 120/725 [3:44:41<10:21:27, 61.63s/it]

([5, 7, 5], [8, 7, 2])
tensor(2.8169, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.3277040421962738), (499999, -0.00018808944150805473)]
([5, 7, 5], [8, 7, 2])
tensor(3.0083, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.4457521438598633), (499999, -0.00037617888301610947)]
([5, 0, 5], [8, 7, 2])
tensor(2.2760, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.3697570562362671), (499999, -0.0005642683245241642)]
([5, 0, 5], [8, 7, 2])
tensor(2.1948, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.5115973353385925), (499999, -0.0007523577660322189)]
([5, 0, 5], [8, 7, 2])
tensor(2.1928, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.6126220226287842), (499999, -0.000940447673201561)]
([5, 0, 5], [8, 7, 2])
tensor(2.1923, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.7054505348205566), (499999, -0.0011285357177257538)]
([5, 0, 5], [8, 7, 2

 17%|█▋        | 121/725 [3:50:41<25:21:49, 151.18s/it]

test：0.0, test mean: 0.18421052631578946
([5, 7, 2], [2, 7, 2])
tensor(8.6882, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.2212291955947876), (499999, 0.0)]
([5, 7, 2], [2, 7, 2])
tensor(8.7727, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -8.208015441894531), (499999, 0.0)]
([5, 7, 2], [2, 7, 2])
tensor(6.9946, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.036356449127197), (499999, 0.0)]
([5, 7, 2], [2, 7, 2])
tensor(5.3539, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -7.512913703918457), (499999, 0.0)]
([5, 0, 2], [2, 7, 2])
tensor(5.0654, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.714123725891113), (499999, 0.0)]
([5, 0, 2], [2, 7, 2])
tensor(4.9981, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -10.635897636413574), (499999, 0.0)]
([5, 0, 2], [2, 7, 2])
tensor(4.9825, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0

 17%|█▋        | 122/725 [3:56:55<36:31:02, 218.01s/it]

test：0.0, test mean: 0.1794871794871795
([0, 7, 0], [8, 7, 0])
tensor(0.9440, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, 0.022385049611330032)]
([0, 7, 0], [8, 7, 0])
tensor(1.0826, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, -0.8747965097427368)]
([0, 7, 0], [8, 7, 0])
tensor(0.8871, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, -0.5565708875656128)]
([0, 7, 0], [8, 7, 0])
tensor(0.8934, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, 0.2663736343383789)]
([0, 7, 0], [8, 7, 0])
tensor(0.8677, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, 0.31124791502952576)]
([0, 7, 0], [8, 7, 0])
tensor(0.8673, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, 0.34739556908607483)]
([0, 7, 0], [8, 7, 0])
tensor(0.8669, device='cuda:0', grad_fn=<NllLossBackward0>)
[(49999

 17%|█▋        | 123/725 [4:02:26<42:09:41, 252.13s/it]

test：1.0, test mean: 0.2
([0, 1, 0], [0, 1, 0])
tensor(3.1448, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.00654375646263361), (499999, 0.0), (499999, 0.0)]
([0, 1, 0], [0, 1, 0])
tensor(3.2366, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.9988633394241333), (499999, 0.0), (499999, 0.0)]
([0, 1, 0], [0, 1, 0])
tensor(3.1440, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -1.06786048412323), (499999, 0.0), (499999, 0.0)]
([0, 1, 0], [0, 1, 0])
tensor(3.2296, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -2.1455295085906982), (499999, 0.0), (499999, 0.0)]
([0, 1, 0], [0, 1, 0])
tensor(3.1373, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -2.2026782035827637), (499999, 0.0), (499999, 0.0)]
([0, 1, 0], [0, 1, 0])
tensor(3.1728, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -3.333728551864624), (499999, 0.0), (499999, 0.0)]
([0, 1, 0], [0, 1, 0])
tensor(3.1361, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -3.37700057029

 17%|█▋        | 124/725 [4:07:49<45:35:44, 273.12s/it]

test：1.0, test mean: 0.21951219512195122
([2, 0, 2], [2, 0, 2])
tensor(9.1467, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 17%|█▋        | 125/725 [4:08:17<33:16:41, 199.67s/it]

([5, 2, 6], [5, 2, 6])
tensor(6.0826, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 17%|█▋        | 126/725 [4:08:44<24:37:59, 148.05s/it]

([1, 5, 2], [1, 5, 2])
tensor(6.5383, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 2.383102400926873e-05)]


 18%|█▊        | 127/725 [4:09:13<18:39:33, 112.33s/it]

([0, 8, 1], [0, 8, 1])
tensor(4.6715, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 18%|█▊        | 128/725 [4:09:43<14:30:46, 87.52s/it] 

([1, 1, 5], [1, 1, 1])
tensor(3.4345, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.008185875602066517), (499999, 0.0)]


 18%|█▊        | 129/725 [4:10:13<11:37:08, 70.18s/it]

([2, 0, 8], [2, 0, 8])
tensor(4.6201, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 18%|█▊        | 130/725 [4:10:41<9:31:42, 57.65s/it] 

([2, 2, 2], [2, 2, 2])
tensor(13.2599, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0036626337096095085), (499999, 0.0)]


 18%|█▊        | 131/725 [4:11:10<8:04:06, 48.90s/it]

([1, 2, 0], [1, 2, 0])
tensor(7.8799, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 18%|█▊        | 132/725 [4:11:39<7:05:50, 43.09s/it]

([1, 2, 7], [1, 2, 7])
tensor(8.0701, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 0.20985053479671478)]
([1, 2, 7], [1, 2, 7])
tensor(8.1159, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -7.632512092590332)]
([1, 2, 7], [1, 2, 7])
tensor(6.5301, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -5.354752540588379)]
([1, 2, 0], [1, 2, 7])
tensor(5.7534, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -5.214088439941406)]
([1, 2, 0], [1, 2, 7])
tensor(5.7520, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -5.185461044311523)]
([1, 2, 0], [1, 2, 7])
tensor(5.7517, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -5.156064033508301)]
([1, 2, 0], [1, 2, 7])
tensor(5.7514, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -5.144565582

 18%|█▊        | 133/725 [4:17:25<22:01:30, 133.94s/it]

test：0.0, test mean: 0.21428571428571427
([0, 1, 1], [0, 1, 1])
tensor(3.9747, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -0.00209122896194458), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [0, 1, 1])
tensor(4.0834, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -2.958533763885498), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [0, 1, 1])
tensor(3.9741, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -3.0145480632781982), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [0, 1, 1])
tensor(4.0739, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -5.295422554016113), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [0, 1, 1])
tensor(3.9678, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -5.314051628112793), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [0, 1, 1])
tensor(3.9669, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -5.347550868988037), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [0, 1, 1])
tensor(3.9665, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079,

 18%|█▊        | 134/725 [4:22:31<30:25:54, 185.37s/it]

test：1.0, test mean: 0.23255813953488372
([2, 1, 0], [2, 1, 0])
tensor(7.4646, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -0.17160922288894653)]
([2, 1, 0], [2, 1, 0])
tensor(7.4684, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 2.2358291149139404)]
([2, 1, 0], [2, 1, 0])
tensor(7.3584, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 2.4734840393066406)]
([2, 1, 0], [2, 1, 0])
tensor(7.3360, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 2.4847936630249023)]
([2, 1, 0], [2, 1, 0])
tensor(7.3293, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 2.4676737785339355)]
([2, 1, 0], [2, 1, 0])
tensor(7.3250, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 2.4480748176574707)]
([2, 1, 0], [2, 1, 0])
tensor(7.3218, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 

 19%|█▊        | 135/725 [4:27:39<36:24:28, 222.15s/it]

test：1.0, test mean: 0.25
([2, 1, 1], [2, 1, 1])
tensor(7.4476, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 19%|█▉        | 136/725 [4:28:09<26:55:23, 164.56s/it]

([5, 2, 1], [5, 2, 1])
tensor(7.7228, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0007229836191982031), (499999, 0.0)]


 19%|█▉        | 137/725 [4:28:35<20:05:52, 123.05s/it]

([1, 2, 7], [1, 2, 7])
tensor(10.3127, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.21022143959999084)]
([1, 2, 7], [1, 2, 7])
tensor(10.3130, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -11.156481742858887)]
([1, 2, 7], [1, 2, 7])
tensor(8.6614, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.178177833557129)]
([1, 2, 7], [1, 2, 7])
tensor(7.0942, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -7.822741508483887)]
([1, 2, 0], [1, 2, 7])
tensor(6.6632, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -7.227829456329346)]
([1, 2, 0], [1, 2, 7])
tensor(6.5928, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -7.103837490081787)]
([1, 2, 0], [1, 2, 7])
tensor(6.5686, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.915843

 19%|█▉        | 138/725 [4:34:01<29:58:38, 183.85s/it]

test：0.0, test mean: 0.24444444444444444
([8, 7, 7], [8, 7, 7])
tensor(6.7360, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.30120617151260376), (380927, 0.22119253873825073), (380927, 0.21019130945205688)]
([7, 7, 7], [8, 7, 7])
tensor(6.8887, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 3.8704118728637695), (380927, -6.58513069152832), (380927, -7.354665756225586)]
([0, 7, 7], [8, 7, 7])
tensor(3.0297, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 3.8276333808898926), (380927, -3.9413819313049316), (380927, -0.4373376965522766)]
([0, 0, 0], [8, 7, 7])
tensor(0.0988, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 3.809699058532715), (380927, -4.097410678863525), (380927, 0.3895733952522278)]
([0, 0, 0], [8, 7, 7])
tensor(0.0525, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 3.797093391418457), (380927, -4.277316093444824), (380927, 0.6178244948387146)]
([0, 0, 0], [8, 7, 7])
tensor(0.0432, device='cuda:0', grad_fn=<NllLossBackward0>)
[(45

 19%|█▉        | 139/725 [4:39:30<37:01:50, 227.49s/it]

test：0.0, test mean: 0.2391304347826087
([3, 1, 5], [3, 1, 8])
tensor(5.0269, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 19%|█▉        | 140/725 [4:40:02<27:27:40, 168.99s/it]

([2, 1, 5], [2, 1, 5])
tensor(6.6745, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0012239241041243076), (499999, 0.0)]


 19%|█▉        | 141/725 [4:40:37<20:53:10, 128.75s/it]

([5, 2, 1], [5, 2, 1])
tensor(7.3419, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 20%|█▉        | 142/725 [4:41:14<16:22:25, 101.11s/it]

([3, 0, 7], [3, 1, 7])
tensor(3.4708, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -0.7005597352981567), (499999, 0.0), (380927, 0.20937561988830566)]
([3, 0, 7], [3, 1, 7])
tensor(3.1874, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -2.412112236022949), (499999, 0.0), (380927, -4.64815616607666)]
([0, 0, 7], [3, 1, 7])
tensor(1.5221, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -0.9318068623542786), (499999, 0.0), (380927, 2.218414783477783)]
([0, 0, 0], [3, 1, 7])
tensor(0.4514, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -0.23811717331409454), (499999, 0.0), (380927, 2.460346221923828)]
([0, 0, 0], [3, 1, 7])
tensor(0.3084, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 0.28290683031082153), (499999, 0.0), (380927, 2.54495906829834)]
([0, 0, 0], [3, 1, 7])
tensor(0.2864, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 0.5948169827461243), (499999, 0.0), (380927, 2.623411178588867)]
([0, 0, 0], [3, 1, 7])
tensor(0.2787, device='

 20%|█▉        | 143/725 [4:47:03<28:22:19, 175.50s/it]

test：0.0, test mean: 0.23404255319148937
([0, 2, 0], [0, 2, 0])
tensor(4.3726, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.00040882735629566014), (405503, -0.005959148984402418)]
([0, 2, 0], [0, 2, 0])
tensor(4.6077, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0008176547125913203), (405503, 2.244682788848877)]
([0, 2, 0], [0, 2, 0])
tensor(4.3723, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0012264822144061327), (405503, 2.234037399291992)]
([0, 2, 0], [0, 2, 0])
tensor(4.3722, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0016353094251826406), (405503, 2.2233448028564453)]
([0, 2, 0], [0, 2, 0])
tensor(4.3721, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0020441371016204357), (405503, 2.2123191356658936)]
([0, 2, 0], [0, 2, 0])
tensor(4.3721, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0024529644288122654), (405503, 

 20%|█▉        | 144/725 [4:52:31<35:43:23, 221.35s/it]

test：1.0, test mean: 0.25
([0, 2, 1], [0, 2, 1])
tensor(5.7125, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 4.914683700008027e-07), (499999, 0.0), (499999, 0.0)]


 20%|██        | 145/725 [4:52:57<26:12:53, 162.71s/it]

([8, 8, 5], [8, 8, 8])
tensor(3.7733, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 20%|██        | 146/725 [4:53:24<19:37:11, 121.99s/it]

([2, 8, 2], [2, 5, 2])
tensor(9.7993, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 20%|██        | 147/725 [4:53:54<15:09:26, 94.41s/it] 

([2, 2, 0], [2, 0, 0])
tensor(5.6129, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 20%|██        | 148/725 [4:54:25<12:05:49, 75.47s/it]

([5, 7, 1], [5, 7, 1])
tensor(6.6004, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.5024181604385376), (380927, 0.21022090315818787), (499999, 0.0)]
([1, 7, 1], [5, 7, 1])
tensor(5.8288, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 1.1276146173477173), (380927, -9.797645568847656), (499999, 0.0)]
([0, 7, 1], [5, 7, 1])
tensor(3.7458, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 5.719825744628906), (380927, -4.080945014953613), (499999, 0.0)]
([0, 7, 1], [5, 7, 1])
tensor(2.0888, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 5.987676620483398), (380927, 0.3655526041984558), (499999, 0.0)]
([0, 0, 1], [5, 7, 1])
tensor(1.6408, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 6.152915000915527), (380927, -0.8267897367477417), (499999, 0.0)]
([0, 0, 1], [5, 7, 1])
tensor(1.5627, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 6.2713117599487305), (380927, -1.4537806510925293), (499999, 0.0)]
([0, 0, 1], [5, 7, 1])
tensor(1.5304, device=

 21%|██        | 149/725 [4:59:22<22:40:00, 141.67s/it]

test：0.0, test mean: 0.24489795918367346
([2, 8, 1], [2, 8, 1])
tensor(8.7170, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.00010672771895769984)]


 21%|██        | 150/725 [4:59:50<17:11:30, 107.64s/it]

([1, 1, 1], [1, 1, 1])
tensor(5.4685, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 21%|██        | 151/725 [5:00:17<13:19:36, 83.58s/it] 

([2, 5, 6], [2, 5, 6])
tensor(8.5791, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 21%|██        | 152/725 [5:00:47<10:44:56, 67.53s/it]

([1, 5, 2], [1, 5, 2])
tensor(7.7836, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.004030886106193066), (499999, 0.0), (499999, 0.0)]


 21%|██        | 153/725 [5:01:17<8:54:48, 56.10s/it] 

([5, 0, 2], [6, 0, 2])
tensor(4.0928, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 0.02047356590628624), (499999, 0.0)]
([5, 0, 2], [6, 0, 2])
tensor(4.1814, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.3742971420288086), (499999, 0.0)]
([5, 0, 2], [6, 0, 2])
tensor(4.0494, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.19321340322494507), (499999, 0.0)]
([5, 0, 2], [6, 0, 2])
tensor(4.0426, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.12167291343212128), (499999, 0.0)]
([5, 0, 2], [6, 0, 2])
tensor(4.0415, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.06942429393529892), (499999, 0.0)]
([5, 0, 2], [6, 0, 2])
tensor(4.0407, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.03087359480559826), (499999, 0.0)]
([5, 0, 2], [6, 0, 2])
tensor(4.0405, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 0.0002583228051662

 21%|██        | 154/725 [5:06:15<20:24:57, 128.72s/it]

test：1.0, test mean: 0.26
([7, 5, 0], [7, 3, 7])
tensor(3.2137, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.3477725088596344), (275966, -0.07825746387243271), (499999, 0.0)]
([7, 0, 0], [7, 3, 7])
tensor(3.3875, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 1.1226422786712646), (275966, 3.601917266845703), (499999, 0.0)]
([7, 0, 0], [7, 3, 7])
tensor(1.8151, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 5.975650310516357), (275966, 4.356860160827637), (499999, 0.0)]
([0, 0, 0], [7, 3, 7])
tensor(0.3092, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 5.864195823669434), (275966, 4.364631652832031), (499999, 0.0)]
([0, 0, 0], [7, 3, 7])
tensor(0.3065, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 5.730541706085205), (275966, 4.3674845695495605), (499999, 0.0)]
([0, 0, 0], [7, 3, 7])
tensor(0.3043, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 5.583395481109619), (275966, 4.398441314697266), (499999, 0.0)]
([0, 0, 0], [7, 3, 7])
te

 21%|██▏       | 155/725 [5:11:26<29:01:03, 183.27s/it]

test：0.0, test mean: 0.2549019607843137
([0, 2, 0], [0, 2, 0])
tensor(4.4099, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, 0.004038384184241295)]
([0, 2, 7], [0, 2, 0])
tensor(4.6966, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -2.0564639568328857)]
([0, 2, 0], [0, 2, 0])
tensor(4.3942, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -2.3555777072906494)]
([0, 2, 0], [0, 2, 0])
tensor(4.5502, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -0.2851671576499939)]
([0, 2, 0], [0, 2, 0])
tensor(4.3610, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -0.3083650469779968)]
([0, 2, 0], [0, 2, 0])
tensor(4.3634, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -0.3053635358810425)]
([0, 2, 0], [0, 2, 0])
tensor(4.3597, device='cuda:0', grad_fn=<NllLossBackward0>)
[(4999

 22%|██▏       | 156/725 [5:16:23<34:24:07, 217.66s/it]

test：1.0, test mean: 0.2692307692307692
([2, 5, 7], [2, 5, 7])
tensor(8.2598, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.3589417040348053)]
([2, 5, 7], [2, 5, 7])
tensor(8.6064, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -1.3430665731430054)]
([2, 5, 7], [2, 5, 7])
tensor(6.7544, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 5.551878929138184)]
([2, 5, 7], [2, 5, 7])
tensor(4.8549, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.3112831115722656)]
([2, 5, 0], [2, 5, 7])
tensor(4.5703, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 3.0143566131591797)]
([2, 5, 0], [2, 5, 7])
tensor(4.5173, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.5258734226226807)]
([2, 5, 0], [2, 5, 7])
tensor(4.4970, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.

 22%|██▏       | 157/725 [5:22:08<40:22:21, 255.88s/it]

test：0.0, test mean: 0.2641509433962264
([8, 7, 3], [8, 7, 3])
tensor(4.4865, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 0.2073674201965332), (499999, 0.0)]
([8, 7, 3], [8, 7, 3])
tensor(4.5047, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -6.002110958099365), (499999, 0.0)]
([8, 7, 3], [8, 7, 3])
tensor(3.4266, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 2.3815178871154785), (499999, 0.0)]
([8, 0, 3], [8, 7, 3])
tensor(2.7372, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 2.5032801628112793), (499999, 0.0)]
([8, 0, 3], [8, 7, 3])
tensor(2.7363, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 2.6071197986602783), (499999, 0.0)]
([8, 0, 3], [8, 7, 3])
tensor(2.7353, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 2.698679208755493), (499999, 0.0)]
([8, 0, 3], [8, 7, 3])
tensor(2.7351, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0)

 22%|██▏       | 158/725 [5:27:35<43:37:37, 277.00s/it]

test：0.0, test mean: 0.25925925925925924
([2, 2, 1], [2, 2, 1])
tensor(11.8052, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 22%|██▏       | 159/725 [5:28:00<31:41:58, 201.62s/it]

([1, 2, 8], [1, 2, 8])
tensor(7.4651, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 22%|██▏       | 160/725 [5:28:25<23:19:23, 148.61s/it]

([1, 6, 2], [1, 6, 2])
tensor(8.0483, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 22%|██▏       | 161/725 [5:28:52<17:32:59, 112.02s/it]

([2, 1, 7], [2, 1, 0])
tensor(6.7567, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.0)]
([2, 1, 7], [2, 1, 0])
tensor(6.7567, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.0)]
([2, 1, 7], [2, 1, 0])
tensor(6.7567, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.0)]
([2, 1, 7], [2, 1, 0])
tensor(6.7567, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.0)]
([2, 1, 7], [2, 1, 0])
tensor(6.7567, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.0)]
([2, 1, 7], [2, 1, 0])
tensor(6.7567, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.0)]
([2, 1, 7], [2, 1, 0])
tensor(6.7567, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.0)]
([2, 1, 7], [2, 1, 0])
tensor(6.7567, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0)

 22%|██▏       | 162/725 [5:33:50<26:15:29, 167.90s/it]

test：0.0, test mean: 0.2545454545454545
([5, 2, 5], [5, 2, 5])
tensor(5.9537, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.0016286701429635286)]


 22%|██▏       | 163/725 [5:34:19<19:42:36, 126.26s/it]

([5, 8, 0], [5, 5, 1])
tensor(2.8761, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -0.043150559067726135)]
([5, 8, 1], [5, 5, 1])
tensor(3.1928, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 1.5880744457244873)]
([5, 8, 0], [5, 5, 1])
tensor(2.8416, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 2.7076592445373535)]
([5, 8, 0], [5, 5, 1])
tensor(2.8239, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 2.385746479034424)]
([5, 8, 0], [5, 5, 1])
tensor(2.7376, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 2.589146137237549)]
([5, 8, 0], [5, 5, 1])
tensor(2.7333, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 2.712857961654663)]
([5, 8, 0], [5, 5, 1])
tensor(2.7314, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 2.83621215820

 23%|██▎       | 164/725 [5:39:20<27:48:05, 178.40s/it]

test：0.0, test mean: 0.25
([2, 7, 2], [2, 7, 2])
tensor(10.8329, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.20832374691963196), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(10.5578, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -6.76973819732666), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(9.7738, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.2072944641113281), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(9.0606, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.2866716384887695), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(9.0570, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.3272969722747803), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(9.0561, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.3637852668762207), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(9.0553, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 

 23%|██▎       | 165/725 [5:44:10<32:59:11, 212.06s/it]

test：0.0, test mean: 0.24561403508771928
([5, 0, 1], [5, 0, 1])
tensor(3.4026, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 23%|██▎       | 166/725 [5:44:36<24:15:31, 156.23s/it]

([6, 7, 1], [6, 7, 1])
tensor(7.2629, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21022023260593414), (499999, 0.0)]
([6, 7, 1], [6, 7, 1])
tensor(7.0781, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.797548294067383), (499999, 0.0)]
([6, 7, 1], [6, 7, 1])
tensor(5.4112, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.088521480560303), (499999, 0.0)]
([6, 7, 1], [6, 7, 1])
tensor(3.8710, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.0994915962219238), (499999, 0.0)]
([6, 0, 1], [6, 7, 1])
tensor(3.5063, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.340364933013916), (499999, 0.0)]
([6, 0, 1], [6, 7, 1])
tensor(3.4685, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.350093960762024), (499999, 0.0)]
([6, 0, 1], [6, 7, 1])
tensor(3.4518, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.6830644607543945), (499

 23%|██▎       | 167/725 [5:49:08<29:36:09, 190.98s/it]

test：0.0, test mean: 0.2413793103448276
([8, 8, 7], [8, 8, 7])
tensor(5.1233, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -0.35845524072647095)]
([8, 8, 7], [8, 8, 7])
tensor(5.5428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.424577474594116)]
([8, 8, 7], [8, 8, 7])
tensor(4.1984, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 9.345376014709473)]
([8, 8, 0], [8, 8, 7])
tensor(3.0465, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 9.437032699584961)]
([8, 8, 0], [8, 8, 7])
tensor(3.0465, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 9.541444778442383)]
([8, 8, 0], [8, 8, 7])
tensor(3.0455, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 9.62060546875)]
([8, 8, 0], [8, 8, 7])
tensor(3.0451, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499

 23%|██▎       | 168/725 [5:54:13<34:51:12, 225.27s/it]

test：0.0, test mean: 0.23728813559322035
([2, 2, 5], [2, 2, 5])
tensor(9.7572, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 23%|██▎       | 169/725 [5:54:37<25:26:31, 164.73s/it]

([5, 8, 8], [5, 8, 8])
tensor(3.9633, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -1.2282100215088576e-05), (499999, 0.0), (499999, 0.0)]


 23%|██▎       | 170/725 [5:54:59<18:48:41, 122.02s/it]

([0, 1, 8], [0, 1, 8])
tensor(3.8368, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.0006929663359187543)]


 24%|██▎       | 171/725 [5:55:23<14:13:29, 92.44s/it] 

([2, 1, 1], [2, 1, 1])
tensor(9.3848, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 24%|██▎       | 172/725 [5:55:50<11:11:12, 72.83s/it]

([6, 7, 1], [6, 7, 1])
tensor(4.1593, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 24%|██▍       | 173/725 [5:56:17<9:03:37, 59.09s/it] 

([7, 7, 7], [7, 7, 7])
tensor(5.5679, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.34793218970298767), (499999, 0.0), (499999, 0.0)]
([7, 7, 7], [7, 7, 7])
tensor(5.8994, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.38583335280418396), (499999, 0.0), (499999, 0.0)]
([7, 7, 7], [7, 7, 7])
tensor(4.0800, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.127925872802734), (499999, 0.0), (499999, 0.0)]
([7, 7, 7], [7, 7, 7])
tensor(2.1805, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.4557425379753113), (499999, 0.0), (499999, 0.0)]
([0, 7, 7], [7, 7, 7])
tensor(1.9418, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.272977352142334), (499999, 0.0), (499999, 0.0)]
([0, 7, 7], [7, 7, 7])
tensor(1.8963, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.5934091210365295), (499999, 0.0), (499999, 0.0)]
([0, 7, 7], [7, 7, 7])
tensor(1.8782, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.2826669216156006), (499999, 0.0), (4999

 24%|██▍       | 174/725 [6:01:30<20:42:15, 135.27s/it]

test：0.0, test mean: 0.23333333333333334
([6, 8, 2], [6, 8, 2])
tensor(6.7651, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 24%|██▍       | 175/725 [6:02:00<15:50:22, 103.68s/it]

([2, 2, 0], [2, 2, 0])
tensor(7.2604, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 24%|██▍       | 176/725 [6:02:32<12:31:32, 82.14s/it] 

([7, 0, 3], [7, 0, 7])
tensor(3.3432, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.20970755815505981), (499999, 0.0), (499999, 0.0)]
([7, 0, 3], [7, 0, 7])
tensor(3.2873, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.9051756858825684), (499999, 0.0), (499999, 0.0)]
([7, 0, 3], [7, 0, 7])
tensor(2.0677, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.84891939163208), (499999, 0.0), (499999, 0.0)]
([0, 0, 3], [7, 0, 7])
tensor(0.9229, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9283511638641357), (499999, 0.0), (499999, 0.0)]
([0, 0, 3], [7, 0, 7])
tensor(0.9202, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.987347364425659), (499999, 0.0), (499999, 0.0)]
([0, 0, 3], [7, 0, 7])
tensor(0.9188, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 3.0339527130126953), (499999, 0.0), (499999, 0.0)]
([0, 0, 3], [7, 0, 7])
tensor(0.9178, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 3.071807384490967), (499999, 0.0), (499999, 

 24%|██▍       | 177/725 [6:08:02<23:50:15, 156.60s/it]

test：0.0, test mean: 0.22950819672131148
([7, 8, 5], [7, 8, 5])
tensor(4.9324, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21018540859222412), (499999, 0.0), (499999, 0.0)]
([7, 8, 5], [7, 8, 5])
tensor(4.9219, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.091545104980469), (499999, 0.0), (499999, 0.0)]
([7, 8, 5], [7, 8, 5])
tensor(3.3554, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.907001495361328), (499999, 0.0), (499999, 0.0)]
([0, 8, 5], [7, 8, 5])
tensor(2.0258, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.695780277252197), (499999, 0.0), (499999, 0.0)]
([0, 8, 5], [7, 8, 5])
tensor(2.0081, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.895008087158203), (499999, 0.0), (499999, 0.0)]
([0, 8, 5], [7, 8, 5])
tensor(2.0028, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.014434814453125), (499999, 0.0), (499999, 0.0)]
([0, 8, 5], [7, 8, 5])
tensor(1.9976, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -

 25%|██▍       | 178/725 [6:13:10<30:41:25, 201.98s/it]

test：0.0, test mean: 0.22580645161290322
([2, 1, 1], [2, 1, 1])
tensor(11.3917, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 25%|██▍       | 179/725 [6:13:34<22:33:44, 148.76s/it]

([1, 2, 1], [1, 2, 1])
tensor(5.3619, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 25%|██▍       | 180/725 [6:14:00<16:55:45, 111.83s/it]

([1, 7, 7], [1, 7, 7])
tensor(6.7128, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.22123022377490997), (499999, 0.0)]
([1, 7, 7], [1, 7, 7])
tensor(6.7396, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.037315368652344), (499999, 0.0)]
([1, 7, 7], [1, 7, 7])
tensor(4.9825, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.783357620239258), (499999, 0.0)]
([1, 7, 7], [1, 7, 7])
tensor(3.3625, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -7.779278755187988), (499999, 0.0)]
([1, 0, 7], [1, 7, 7])
tensor(3.0071, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -8.284786224365234), (499999, 0.0)]
([1, 0, 7], [1, 7, 7])
tensor(2.9565, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -8.87623405456543), (499999, 0.0)]
([1, 0, 7], [1, 7, 7])
tensor(2.9247, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.724459648132324), (499999

 25%|██▍       | 181/725 [6:19:03<25:34:06, 169.20s/it]

test：0.0, test mean: 0.2222222222222222
([2, 7, 3], [2, 7, 3])
tensor(9.5160, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.35893166065216064), (499999, 0.0)]
([2, 7, 3], [2, 7, 3])
tensor(9.8378, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.916163444519043), (499999, 0.0)]
([2, 7, 3], [2, 7, 3])
tensor(8.1790, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 10.651100158691406), (499999, 0.0)]
([2, 0, 3], [2, 7, 3])
tensor(6.5182, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 10.233491897583008), (499999, 0.0)]
([2, 0, 3], [2, 7, 3])
tensor(6.4611, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 9.497564315795898), (499999, 0.0)]
([2, 0, 3], [2, 7, 3])
tensor(6.4470, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 8.973923683166504), (499999, 0.0)]
([2, 0, 3], [2, 7, 3])
tensor(6.4410, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0)

 25%|██▌       | 182/725 [6:23:53<30:59:02, 205.42s/it]

test：0.0, test mean: 0.21875
([5, 1, 0], [5, 1, 0])
tensor(3.3836, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 25%|██▌       | 183/725 [6:24:19<22:49:20, 151.59s/it]

([5, 0, 5], [5, 0, 1])
tensor(1.5625, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, 0.018892724066972733), (499999, 0.0)]
([5, 7, 5], [5, 0, 1])
tensor(1.8274, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, 0.12247049808502197), (499999, 0.0)]
([5, 0, 5], [5, 0, 1])
tensor(1.5309, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, 0.3418205976486206), (499999, 0.0)]
([5, 0, 5], [5, 0, 1])
tensor(1.6087, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, 3.8234310150146484), (499999, 0.0)]
([5, 0, 5], [5, 0, 1])
tensor(1.4855, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, 3.846937894821167), (499999, 0.0)]
([5, 0, 5], [5, 0, 1])
tensor(1.4854, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, 3.868380546569824), (499999, 0.0)]
([5, 0, 5], [5, 0, 1])
tensor(1.4852, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, 3.8858447074890137), (49999

 25%|██▌       | 184/725 [6:29:06<28:53:19, 192.24s/it]

test：1.0, test mean: 0.23076923076923078
([5, 0, 1], [3, 0, 1])
tensor(2.8121, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 26%|██▌       | 185/725 [6:29:32<21:21:52, 142.43s/it]

([6, 8, 2], [6, 4, 2])
tensor(7.7909, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 26%|██▌       | 186/725 [6:29:59<16:06:09, 107.55s/it]

([8, 0, 1], [8, 1, 1])
tensor(2.3794, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.4358409345149994), (499999, 0.0), (499999, 0.0)]
([7, 0, 1], [8, 1, 1])
tensor(2.3789, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 3.697566032409668), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [8, 1, 1])
tensor(1.6319, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 3.65312123298645), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [8, 1, 1])
tensor(1.6287, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 3.624075412750244), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [8, 1, 1])
tensor(1.6280, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 3.6087799072265625), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [8, 1, 1])
tensor(1.6269, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 3.59378719329834), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [8, 1, 1])
tensor(1.6266, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 3.6100666522979736), (499999, 0.0), (499999, 0.0)

 26%|██▌       | 187/725 [6:34:40<23:52:10, 159.72s/it]

test：0.0, test mean: 0.22727272727272727
([0, 2, 5], [0, 2, 1])
tensor(4.8445, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 26%|██▌       | 188/725 [6:35:08<17:55:00, 120.11s/it]

([0, 8, 0], [0, 8, 0])
tensor(1.7728, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 26%|██▌       | 189/725 [6:35:35<13:43:46, 92.21s/it] 

([2, 5, 1], [2, 0, 1])
tensor(5.4885, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 26%|██▌       | 190/725 [6:36:01<10:45:49, 72.43s/it]

([2, 0, 8], [2, 7, 8])
tensor(4.4449, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 26%|██▋       | 191/725 [6:36:27<8:39:31, 58.37s/it] 

([7, 1, 8], [7, 1, 8])
tensor(6.7837, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21019206941127777), (499999, 0.0), (499999, 0.0)]
([7, 1, 8], [7, 1, 8])
tensor(6.7613, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.091986656188965), (499999, 0.0), (499999, 0.0)]
([7, 1, 8], [7, 1, 8])
tensor(5.1991, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.902533531188965), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(3.8429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.8134965896606445), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(3.8165, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.030231475830078), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(3.8099, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.15861701965332), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(3.8065, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.247251987457275), (499999, 0.0), (49999

 26%|██▋       | 192/725 [6:41:14<18:49:44, 127.18s/it]

test：0.0, test mean: 0.22388059701492538
([2, 7, 2], [2, 7, 2])
tensor(9.6551, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21018296480178833), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(9.7957, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.092253684997559), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(8.2195, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.8944091796875), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(6.8267, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.792412757873535), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(6.8010, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.001131057739258), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(6.7940, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.104905605316162), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(6.7903, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0

 27%|██▋       | 193/725 [6:46:03<25:57:07, 175.62s/it]

test：0.0, test mean: 0.22058823529411764
([2, 0, 1], [2, 0, 1])
tensor(8.4869, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 27%|██▋       | 194/725 [6:46:30<19:19:59, 131.07s/it]

([1, 5, 0], [1, 5, 1])
tensor(3.5804, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 27%|██▋       | 195/725 [6:46:55<14:36:42, 99.25s/it] 

([3, 1, 1], [3, 1, 1])
tensor(5.6356, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 27%|██▋       | 196/725 [6:47:21<11:20:20, 77.17s/it]

([0, 0, 2], [0, 0, 2])
tensor(4.1939, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 0.007585838437080383), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(4.3263, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -2.5976552963256836), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(4.1869, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -2.765012264251709), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(4.2890, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -1.8121774196624756), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(4.1706, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -1.842434287071228), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(4.1719, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -1.8776061534881592), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(4.1701, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -1.9020366668701172), (499999, 0.0), (

 27%|██▋       | 197/725 [6:51:59<20:09:42, 137.47s/it]

test：1.0, test mean: 0.2318840579710145
([1, 1, 7], [1, 1, 7])
tensor(6.6970, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 8.080941188381985e-05), (499999, 0.0), (380927, 0.21020588278770447)]
([1, 1, 7], [1, 1, 7])
tensor(6.6222, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0001616188237676397), (499999, 0.0), (380927, -7.355119228363037)]
([1, 1, 7], [1, 1, 7])
tensor(5.2029, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0002424281556159258), (499999, 0.0), (380927, 0.46474093198776245)]
([1, 1, 0], [1, 1, 7])
tensor(3.6430, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0003232376475352794), (499999, 0.0), (380927, 3.1339364051818848)]
([1, 1, 0], [1, 1, 7])
tensor(3.5197, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.000404047139454633), (499999, 0.0), (380927, 4.248035907745361)]
([1, 1, 0], [1, 1, 7])
tensor(3.4965, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00048485613660886884), (499999, 0.0), (380927, 4.815078

 27%|██▋       | 198/725 [6:56:53<27:01:12, 184.58s/it]

test：0.0, test mean: 0.22857142857142856
([7, 1, 2], [7, 1, 2])
tensor(5.3744, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 27%|██▋       | 199/725 [6:57:17<19:55:09, 136.33s/it]

([5, 7, 1], [5, 7, 1])
tensor(6.9654, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21011996269226074), (499999, 0.0)]
([5, 7, 1], [5, 7, 1])
tensor(6.7858, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.1396164894104), (499999, 0.0)]
([5, 7, 1], [5, 7, 1])
tensor(5.4374, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.2164777517318726), (499999, 0.0)]
([5, 0, 1], [5, 7, 1])
tensor(3.9353, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.9965194463729858), (499999, 0.0)]
([5, 0, 1], [5, 7, 1])
tensor(3.8281, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.229886531829834), (499999, 0.0)]
([5, 0, 1], [5, 7, 1])
tensor(3.8150, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.207744598388672), (499999, 0.0)]
([5, 0, 1], [5, 7, 1])
tensor(3.8066, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.141892433166504), (499999, 0.

 28%|██▊       | 200/725 [7:01:38<25:20:11, 173.74s/it]

test：0.0, test mean: 0.22535211267605634
([0, 2, 2], [0, 2, 2])
tensor(9.4153, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 28%|██▊       | 201/725 [7:02:04<18:50:14, 129.42s/it]

([0, 1, 5], [0, 1, 8])
tensor(3.5788, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 28%|██▊       | 202/725 [7:02:29<14:13:53, 97.96s/it] 

([5, 5, 3], [5, 0, 7])
tensor(2.8500, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0030708620324730873), (499999, 0.0), (499999, -0.0024483990855515003)]


 28%|██▊       | 203/725 [7:02:54<11:03:25, 76.26s/it]

([1, 1, 2], [1, 1, 2])
tensor(6.6224, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 28%|██▊       | 204/725 [7:03:19<8:46:28, 60.63s/it] 

([1, 2, 8], [0, 2, 5])
tensor(6.1905, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 28%|██▊       | 205/725 [7:03:41<7:06:18, 49.19s/it]

([8, 5, 5], [8, 5, 5])
tensor(4.1927, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 7.04123465311568e-07), (499999, 0.0)]


 28%|██▊       | 206/725 [7:04:03<5:55:27, 41.09s/it]

([1, 2, 2], [1, 2, 2])
tensor(13.2925, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.002027725800871849), (499999, 0.0), (499999, 0.0)]


 29%|██▊       | 207/725 [7:04:26<5:07:32, 35.62s/it]

([2, 2, 0], [2, 2, 0])
tensor(8.7494, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 29%|██▊       | 208/725 [7:04:50<4:37:13, 32.17s/it]

([7, 1, 8], [7, 1, 8])
tensor(7.2845, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.2102048397064209), (499999, 0.0), (499999, 0.0)]
([7, 1, 8], [7, 1, 8])
tensor(7.2767, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -8.814412117004395), (499999, 0.0), (499999, 0.0)]
([7, 1, 8], [7, 1, 8])
tensor(5.7806, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.737804412841797), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(4.3096, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.609004020690918), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(4.1736, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.759920120239258), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(4.1522, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.7416458129882812), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(4.1346, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.7375831604003906), (499999, 0.0), (4999

 29%|██▉       | 209/725 [7:09:13<14:31:57, 101.39s/it]

test：0.0, test mean: 0.2222222222222222
([5, 2, 1], [1, 2, 1])
tensor(5.7197, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 29%|██▉       | 210/725 [7:09:36<11:08:41, 77.90s/it] 

([0, 2, 2], [0, 2, 2])
tensor(9.4605, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 29%|██▉       | 211/725 [7:10:00<8:48:26, 61.69s/it] 

([0, 1, 8], [7, 1, 8])
tensor(4.3085, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.011086281388998032), (499999, 0.0)]


 29%|██▉       | 212/725 [7:10:24<7:11:03, 50.42s/it]

([2, 0, 2], [2, 0, 2])
tensor(7.2931, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.007408368866890669), (499999, 0.0)]


 29%|██▉       | 213/725 [7:10:48<6:03:20, 42.58s/it]

([0, 8, 2], [1, 8, 2])
tensor(8.4067, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 30%|██▉       | 214/725 [7:11:14<5:19:55, 37.56s/it]

([2, 1, 1], [2, 1, 1])
tensor(10.3332, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 30%|██▉       | 215/725 [7:11:38<4:44:46, 33.50s/it]

([1, 1, 6], [1, 1, 6])
tensor(4.6461, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 30%|██▉       | 216/725 [7:12:02<4:19:28, 30.59s/it]

([2, 5, 2], [2, 0, 2])
tensor(7.5805, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 30%|██▉       | 217/725 [7:12:26<4:01:57, 28.58s/it]

([2, 6, 5], [2, 6, 5])
tensor(9.9853, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 30%|███       | 218/725 [7:12:51<3:52:57, 27.57s/it]

([6, 1, 8], [6, 1, 8])
tensor(5.3696, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 30%|███       | 219/725 [7:13:15<3:43:39, 26.52s/it]

([5, 1, 1], [5, 1, 1])
tensor(5.3795, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.5041796565055847), (499999, 0.0), (499999, 9.568534005666152e-05)]
([1, 1, 1], [5, 1, 1])
tensor(4.5025, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 5.582158088684082), (499999, 0.0), (499999, 0.00019137068011332303)]
([0, 1, 1], [5, 1, 1])
tensor(3.7335, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 15.616979598999023), (499999, 0.0), (499999, 0.000287056143861264)]
([0, 1, 1], [5, 1, 1])
tensor(3.5352, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 15.601287841796875), (499999, 0.0), (499999, 0.00038274136022664607)]
([0, 1, 1], [5, 1, 1])
tensor(3.5333, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 15.587054252624512), (499999, 0.0), (499999, 0.0004784268676303327)]
([0, 1, 1], [5, 1, 1])
tensor(3.5326, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 15.573246955871582), (499999, 0.0), (499999, 0.0005741121713072062)]
([0, 1, 1], [5, 1, 1])
tens

 30%|███       | 220/725 [7:17:32<13:24:19, 95.56s/it]

test：0.0, test mean: 0.2191780821917808
([5, 7, 1], [5, 7, 1])
tensor(6.2224, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.00393382553011179), (380927, 0.21019810438156128), (499999, 0.010360832326114178)]
([5, 7, 1], [5, 7, 1])
tensor(6.3107, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.00786765106022358), (380927, -9.092628479003906), (499999, 0.020721664652228355)]
([5, 7, 1], [5, 7, 1])
tensor(4.7657, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.011801476590335369), (380927, -4.873405933380127), (499999, 0.031082501634955406)]
([5, 0, 1], [5, 7, 1])
tensor(3.2341, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.01573530212044716), (380927, -4.659365653991699), (499999, 0.04144332930445671)]
([5, 0, 1], [5, 7, 1])
tensor(3.1945, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.019669128581881523), (380927, -5.063767433166504), (499999, 0.05180417001247406)]
([5, 0, 1], [5, 7, 1])
tensor(3.1811, device='cuda:0', grad_fn=<NllLo

 30%|███       | 221/725 [7:21:50<20:12:13, 144.31s/it]

test：0.0, test mean: 0.21621621621621623
([3, 2, 1], [3, 2, 1])
tensor(8.0296, device='cuda:0', grad_fn=<NllLossBackward0>)
[(221183, -0.8032412528991699), (499999, 0.0), (499999, 0.0)]
([3, 2, 1], [3, 2, 1])
tensor(7.9270, device='cuda:0', grad_fn=<NllLossBackward0>)
[(221183, -1.3047466278076172), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [3, 2, 1])
tensor(7.1578, device='cuda:0', grad_fn=<NllLossBackward0>)
[(221183, -1.0475149154663086), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [3, 2, 1])
tensor(7.0135, device='cuda:0', grad_fn=<NllLossBackward0>)
[(221183, -1.5395598411560059), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [3, 2, 1])
tensor(6.9771, device='cuda:0', grad_fn=<NllLossBackward0>)
[(221183, -1.5984739065170288), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [3, 2, 1])
tensor(6.9571, device='cuda:0', grad_fn=<NllLossBackward0>)
[(221183, -1.6729401350021362), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [3, 2, 1])
tensor(6.9554, device='cuda:0', grad_fn=<NllLossBackward0>)
[(2211

 31%|███       | 222/725 [7:26:05<24:47:37, 177.45s/it]

test：0.0, test mean: 0.21333333333333335
([8, 1, 1], [8, 1, 1])
tensor(5.9115, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 31%|███       | 223/725 [7:26:30<18:21:55, 131.70s/it]

([6, 0, 6], [6, 0, 6])
tensor(3.5758, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 31%|███       | 224/725 [7:26:54<13:49:54, 99.39s/it] 

([2, 8, 7], [2, 8, 0])
tensor(5.3066, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -0.269227534532547)]
([2, 8, 7], [2, 8, 0])
tensor(5.6320, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 6.86204719543457)]
([2, 8, 0], [2, 8, 0])
tensor(5.0979, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 9.63802719116211)]
([2, 8, 0], [2, 8, 0])
tensor(5.1819, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 11.658390998840332)]
([2, 8, 0], [2, 8, 0])
tensor(4.9410, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 11.773638725280762)]
([2, 8, 0], [2, 8, 0])
tensor(4.9404, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 11.852681159973145)]
([2, 8, 0], [2, 8, 0])
tensor(4.9401, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 11.92861175537109

 31%|███       | 225/725 [7:31:21<20:48:35, 149.83s/it]

test：1.0, test mean: 0.2236842105263158
([2, 6, 2], [2, 6, 2])
tensor(8.7998, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 31%|███       | 226/725 [7:31:45<15:31:29, 112.00s/it]

([6, 5, 0], [6, 5, 0])
tensor(2.1380, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0004923061933368444), (499999, 0.0)]


 31%|███▏      | 227/725 [7:32:09<11:49:52, 85.53s/it] 

([2, 0, 2], [2, 0, 2])
tensor(7.9052, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 31%|███▏      | 228/725 [7:32:33<9:15:51, 67.11s/it] 

([1, 1, 2], [1, 1, 2])
tensor(6.2255, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 32%|███▏      | 229/725 [7:32:58<7:29:58, 54.43s/it]

([8, 2, 1], [8, 2, 1])
tensor(9.7194, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 32%|███▏      | 230/725 [7:33:24<6:19:41, 46.02s/it]

([2, 5, 7], [2, 3, 7])
tensor(6.9517, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (244514, 3.1215248107910156), (380927, 0.21021980047225952)]
([2, 1, 7], [2, 3, 7])
tensor(6.8855, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (244514, 9.685964584350586), (380927, -9.79750919342041)]
([2, 0, 7], [2, 3, 7])
tensor(4.7698, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (244514, 11.13455581665039), (380927, -4.088637351989746)]
([2, 0, 7], [2, 3, 7])
tensor(3.1979, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (244514, 11.276965141296387), (380927, -0.9628764390945435)]
([2, 0, 0], [2, 3, 7])
tensor(2.8364, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (244514, 11.281824111938477), (380927, -1.1021459102630615)]
([2, 0, 0], [2, 3, 7])
tensor(2.7960, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (244514, 11.271020889282227), (380927, -1.0480492115020752)]
([2, 0, 0], [2, 3, 7])
tensor(2.7752, device=

 32%|███▏      | 231/725 [7:38:07<16:03:44, 117.05s/it]

test：0.0, test mean: 0.22077922077922077
([5, 2, 8], [5, 2, 8])
tensor(5.7829, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 32%|███▏      | 232/725 [7:38:32<12:14:28, 89.39s/it] 

([6, 2, 7], [6, 2, 0])
tensor(5.6939, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, 0.5175716280937195)]
([6, 2, 7], [6, 2, 0])
tensor(6.1267, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, 2.267995834350586)]
([6, 2, 0], [6, 2, 0])
tensor(5.5027, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, 3.991302728652954)]
([6, 2, 7], [6, 2, 0])
tensor(6.3504, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, 2.5585131645202637)]
([6, 2, 0], [6, 2, 0])
tensor(5.3125, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, 2.638005018234253)]
([6, 2, 0], [6, 2, 0])
tensor(5.3123, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, 2.7152140140533447)]
([6, 2, 0], [6, 2, 0])
tensor(5.3122, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, 2.78955078125)]


 32%|███▏      | 233/725 [7:42:53<19:15:58, 140.97s/it]

test：1.0, test mean: 0.23076923076923078
([2, 7, 1], [2, 7, 1])
tensor(5.9023, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 32%|███▏      | 234/725 [7:43:17<14:25:47, 105.80s/it]

([5, 7, 0], [2, 7, 0])
tensor(3.0571, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.20700594782829285), (499999, 0.0)]
([5, 7, 0], [2, 7, 0])
tensor(2.8015, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.5847264528274536), (499999, 0.0)]
([5, 7, 0], [2, 7, 0])
tensor(2.0717, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 5.616915702819824), (499999, 0.0)]
([5, 0, 0], [2, 7, 0])
tensor(1.3685, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 5.822180271148682), (499999, 0.0)]
([5, 0, 0], [2, 7, 0])
tensor(1.3650, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 5.959234237670898), (499999, 0.0)]
([5, 0, 0], [2, 7, 0])
tensor(1.3644, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.089482307434082), (499999, 0.0)]
([5, 0, 0], [2, 7, 0])
tensor(1.3640, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.217215538024902), (499999, 0

 32%|███▏      | 235/725 [7:47:35<20:37:01, 151.47s/it]

test：0.0, test mean: 0.22784810126582278
([7, 5, 2], [7, 5, 2])
tensor(9.2404, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21018435060977936), (499999, -0.0028719687834382057), (499999, 0.0)]
([7, 5, 2], [7, 5, 2])
tensor(9.3372, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.09209156036377), (499999, -0.0057439375668764114), (499999, 0.0)]
([7, 5, 2], [7, 5, 2])
tensor(7.7878, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.892983436584473), (499999, -0.008615905418992043), (499999, 0.0)]
([0, 5, 2], [7, 5, 2])
tensor(6.3859, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.646259307861328), (499999, -0.011487875133752823), (499999, 0.0)]
([0, 5, 2], [7, 5, 2])
tensor(6.3564, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.9279890060424805), (499999, -0.014359842985868454), (499999, 0.0)]
([0, 5, 2], [7, 5, 2])
tensor(6.3481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.059304237365723), (499999, -0.017231812700629234),

 33%|███▎      | 236/725 [7:51:54<24:58:23, 183.85s/it]

test：0.0, test mean: 0.225
([7, 3, 1], [7, 3, 1])
tensor(5.8344, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 0.20977288484573364), (499999, 0.0), (499999, 0.0)]
([7, 3, 1], [7, 3, 1])
tensor(5.7442, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -8.149081230163574), (499999, 0.0), (499999, 0.0)]
([7, 3, 1], [7, 3, 1])
tensor(4.2055, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -6.224745750427246), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [7, 3, 1])
tensor(3.5024, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -6.12448787689209), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [7, 3, 1])
tensor(3.5015, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -6.085467338562012), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [7, 3, 1])
tensor(3.5010, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -6.069795608520508), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [7, 3, 1])
tensor(3.5005, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -6.0534648895263

 33%|███▎      | 237/725 [7:56:18<28:09:47, 207.76s/it]

test：0.0, test mean: 0.2222222222222222
([0, 5, 1], [0, 8, 1])
tensor(2.9761, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 33%|███▎      | 238/725 [7:56:43<20:42:02, 153.02s/it]

([5, 7, 7], [5, 7, 7])
tensor(7.4681, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.2102017104625702), (380927, 0.21020060777664185)]
([5, 7, 7], [5, 7, 7])
tensor(7.4541, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.092561721801758), (380927, -9.07313346862793)]
([5, 7, 7], [5, 7, 7])
tensor(4.3098, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.882675647735596), (380927, -3.9179835319519043)]
([5, 0, 0], [5, 7, 7])
tensor(1.4603, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.417880535125732), (380927, -3.0242412090301514)]
([5, 0, 0], [5, 7, 7])
tensor(1.2885, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.498686790466309), (380927, -2.9422450065612793)]
([5, 0, 0], [5, 7, 7])
tensor(1.2664, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.50743293762207), (380927, -2.879077196121216)]
([5, 0, 0], [5, 7, 7])
tensor(1.2570, device

 33%|███▎      | 239/725 [8:01:19<25:38:37, 189.95s/it]

test：0.0, test mean: 0.21951219512195122
([0, 8, 2], [0, 8, 2])
tensor(5.8447, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 33%|███▎      | 240/725 [8:01:44<18:55:34, 140.48s/it]

([5, 2, 2], [5, 2, 2])
tensor(7.4893, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 33%|███▎      | 241/725 [8:02:08<14:09:46, 105.34s/it]

([0, 0, 1], [0, 0, 1])
tensor(2.8618, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 33%|███▎      | 242/725 [8:02:31<10:49:31, 80.69s/it] 

([5, 5, 0], [1, 1, 0])
tensor(2.4244, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 34%|███▎      | 243/725 [8:02:55<8:32:10, 63.76s/it] 

([2, 0, 1], [2, 0, 1])
tensor(9.5063, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.00017641880549490452), (499999, 0.0)]


 34%|███▎      | 244/725 [8:03:18<6:51:45, 51.36s/it]

([5, 2, 2], [5, 2, 2])
tensor(12.1552, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 34%|███▍      | 245/725 [8:03:41<5:42:52, 42.86s/it]

([7, 1, 5], [7, 1, 5])
tensor(5.9368, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.20985642075538635), (499999, 0.0), (499999, 0.0)]
([7, 1, 5], [7, 1, 5])
tensor(5.8554, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.150140285491943), (499999, 0.0), (499999, 0.0)]
([7, 1, 5], [7, 1, 5])
tensor(4.5528, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.133349895477295), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [7, 1, 5])
tensor(3.4568, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.1770846843719482), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [7, 1, 5])
tensor(3.4520, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.1906685829162598), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [7, 1, 5])
tensor(3.4500, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.2082679271698), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [7, 1, 5])
tensor(3.4446, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.122842311859131), (499999, 0.0), (499999, 0.

 34%|███▍      | 246/725 [8:07:59<14:17:46, 107.45s/it]

test：0.0, test mean: 0.21686746987951808
([2, 0, 8], [2, 7, 8])
tensor(6.6434, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 34%|███▍      | 247/725 [8:08:22<10:53:39, 82.05s/it] 

([2, 5, 7], [2, 5, 7])
tensor(9.3379, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.21018880605697632)]
([2, 5, 7], [2, 5, 7])
tensor(9.3589, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -9.181600570678711)]
([2, 5, 7], [2, 5, 7])
tensor(7.8132, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -5.020386219024658)]
([2, 5, 0], [2, 5, 7])
tensor(6.4379, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.9358649253845215)]
([2, 5, 0], [2, 5, 7])
tensor(6.4081, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -5.150291442871094)]
([2, 5, 0], [2, 5, 7])
tensor(6.4013, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -5.234658241271973)]
([2, 5, 0], [2, 5, 7])
tensor(6.3985, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -5.33012008

 34%|███▍      | 248/725 [8:13:02<18:44:24, 141.44s/it]

test：0.0, test mean: 0.21428571428571427
([5, 2, 5], [5, 2, 5])
tensor(7.3164, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 34%|███▍      | 249/725 [8:13:27<14:05:01, 106.52s/it]

([8, 5, 2], [8, 5, 2])
tensor(7.1469, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 7.616821676492691e-06), (499999, 0.0)]


 34%|███▍      | 250/725 [8:13:51<10:48:05, 81.86s/it] 

([8, 5, 1], [8, 2, 1])
tensor(3.2366, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 35%|███▍      | 251/725 [8:14:15<8:30:42, 64.65s/it] 

([1, 3, 1], [1, 3, 1])
tensor(5.7401, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 35%|███▍      | 252/725 [8:14:41<6:56:29, 52.83s/it]

([2, 7, 0], [2, 7, 1])
tensor(6.2445, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21019169688224792), (499999, 0.0)]
([2, 7, 0], [2, 7, 1])
tensor(6.2866, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.092182159423828), (499999, 0.0)]
([2, 7, 0], [2, 7, 1])
tensor(4.7277, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.894147872924805), (499999, 0.0)]
([2, 0, 0], [2, 7, 1])
tensor(3.3261, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.856865406036377), (499999, 0.0)]
([2, 0, 0], [2, 7, 1])
tensor(3.2982, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.117480278015137), (499999, 0.0)]
([2, 0, 0], [2, 7, 1])
tensor(3.2908, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.225140571594238), (499999, 0.0)]
([2, 0, 0], [2, 7, 1])
tensor(3.2874, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.326409339904785), (49999

 35%|███▍      | 253/725 [8:18:59<15:01:14, 114.57s/it]

test：0.0, test mean: 0.21176470588235294
([3, 1, 1], [5, 1, 1])
tensor(5.8293, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 35%|███▌      | 254/725 [8:19:23<11:25:55, 87.38s/it] 

([7, 2, 5], [7, 2, 5])
tensor(9.3964, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21020564436912537), (499999, 0.0), (499999, 0.0)]
([7, 2, 5], [7, 2, 5])
tensor(9.3028, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -8.814373016357422), (499999, 0.0), (499999, 0.0)]
([7, 2, 5], [7, 2, 5])
tensor(7.7354, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.621085166931152), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [7, 2, 5])
tensor(6.2285, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.552401542663574), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [7, 2, 5])
tensor(6.1908, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.63361930847168), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [7, 2, 5])
tensor(6.1854, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.654354095458984), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [7, 2, 5])
tensor(6.1819, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.640586853027344), (499999, 0.0), (499999

 35%|███▌      | 255/725 [8:23:44<18:12:20, 139.45s/it]

test：0.0, test mean: 0.20930232558139536
([1, 8, 2], [1, 8, 2])
tensor(6.8365, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 0.44854244589805603), (499999, 0.0)]
([1, 1, 2], [1, 8, 2])
tensor(6.8149, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 8.784796714782715), (499999, 0.0)]
([1, 0, 2], [1, 8, 2])
tensor(6.1214, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 9.153197288513184), (499999, 0.0)]
([1, 0, 2], [1, 8, 2])
tensor(6.1080, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 9.309450149536133), (499999, 0.0)]
([1, 0, 2], [1, 8, 2])
tensor(6.1046, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 9.422560691833496), (499999, 0.0)]
([1, 0, 2], [1, 8, 2])
tensor(6.1036, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 9.534135818481445), (499999, 0.0)]
([1, 0, 2], [1, 8, 2])
tensor(6.1028, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), 

 35%|███▌      | 256/725 [8:28:03<22:49:22, 175.19s/it]

test：0.0, test mean: 0.20689655172413793
([0, 0, 1], [7, 0, 1])
tensor(2.7616, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 35%|███▌      | 257/725 [8:28:27<16:52:25, 129.80s/it]

([2, 2, 2], [2, 2, 2])
tensor(13.4492, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 36%|███▌      | 258/725 [8:28:52<12:46:40, 98.50s/it] 

([7, 5, 1], [7, 5, 1])
tensor(4.2608, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 36%|███▌      | 259/725 [8:29:17<9:53:31, 76.42s/it] 

([2, 2, 8], [2, 2, 8])
tensor(8.2581, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 36%|███▌      | 260/725 [8:29:41<7:49:36, 60.59s/it]

([0, 2, 5], [0, 2, 5])
tensor(4.9478, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 36%|███▌      | 261/725 [8:30:03<6:20:20, 49.18s/it]

([0, 5, 1], [7, 7, 1])
tensor(3.8813, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 36%|███▌      | 262/725 [8:30:26<5:19:27, 41.40s/it]

([8, 7, 8], [8, 0, 8])
tensor(3.6432, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0032173546496778727), (499999, 0.0), (499999, 0.0)]


 36%|███▋      | 263/725 [8:30:50<4:37:38, 36.06s/it]

([3, 2, 0], [3, 2, 0])
tensor(4.6907, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 36%|███▋      | 264/725 [8:31:14<4:08:20, 32.32s/it]

([7, 5, 0], [7, 1, 0])
tensor(1.8956, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 37%|███▋      | 265/725 [8:31:38<3:49:11, 29.90s/it]

([5, 7, 2], [5, 7, 2])
tensor(7.6422, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.2211897373199463), (499999, 0.0)]
([5, 7, 2], [5, 7, 2])
tensor(7.6712, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -8.224255561828613), (499999, 0.0)]
([5, 7, 2], [5, 7, 2])
tensor(5.9917, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.9071900844573975), (499999, 0.0)]
([5, 0, 2], [5, 7, 2])
tensor(4.6182, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.77396297454834), (499999, 0.0)]
([5, 0, 2], [5, 7, 2])
tensor(4.5291, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.202088356018066), (499999, 0.0)]
([5, 0, 2], [5, 7, 2])
tensor(4.5092, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.474089622497559), (499999, 0.0)]
([5, 0, 2], [5, 7, 2])
tensor(4.5045, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.7127227783203125), (49999

 37%|███▋      | 266/725 [8:37:03<15:06:21, 118.48s/it]

test：0.0, test mean: 0.20454545454545456
([6, 8, 8], [6, 8, 8])
tensor(3.1732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 37%|███▋      | 267/725 [8:37:38<11:52:10, 93.30s/it] 

([6, 1, 5], [6, 1, 0])
tensor(3.5000, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 37%|███▋      | 268/725 [8:38:05<9:20:05, 73.54s/it] 

([7, 2, 0], [7, 2, 0])
tensor(7.7684, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.22122889757156372), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(7.8674, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.87950325012207), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(6.1726, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.8534722328186035), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(4.5837, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.134761095046997), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(4.2171, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.02239847183227539), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(4.1658, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.0209332704544067), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(4.1354, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.539723515510559), (499999, 0.0), (4999

 37%|███▋      | 269/725 [8:43:54<19:47:09, 156.21s/it]

test：0.0, test mean: 0.20224719101123595
([5, 0, 2], [5, 0, 2])
tensor(5.0491, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 37%|███▋      | 270/725 [8:44:26<15:00:55, 118.80s/it]

([7, 1, 1], [7, 1, 0])
tensor(5.0585, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21022087335586548), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [7, 1, 0])
tensor(5.0143, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.797624588012695), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [7, 1, 0])
tensor(3.4182, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.084017276763916), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [7, 1, 0])
tensor(1.8435, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.120924472808838), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 0])
tensor(1.3847, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.7823395729064941), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 0])
tensor(1.3390, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.7354319095611572), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 0])
tensor(1.3149, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.857954978942871), (499999, 0.0), (499

 37%|███▋      | 271/725 [8:50:02<23:13:13, 184.13s/it]

test：0.0, test mean: 0.2
([2, 0, 2], [2, 0, 2])
tensor(8.6074, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 38%|███▊      | 272/725 [8:50:35<17:27:09, 138.70s/it]

([2, 5, 0], [2, 2, 7])
tensor(6.5458, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.00011921395343961194)]


 38%|███▊      | 273/725 [8:51:15<13:41:28, 109.05s/it]

([1, 5, 1], [1, 5, 1])
tensor(5.4422, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0016286701429635286), (499999, 0.0)]


 38%|███▊      | 274/725 [8:51:45<10:42:52, 85.53s/it] 

([0, 8, 8], [0, 8, 8])
tensor(2.8901, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 38%|███▊      | 275/725 [8:52:17<8:39:14, 69.23s/it] 

([1, 1, 2], [1, 1, 2])
tensor(8.0853, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 38%|███▊      | 276/725 [8:52:51<7:20:21, 58.85s/it]

([3, 2, 6], [3, 2, 6])
tensor(7.4017, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 38%|███▊      | 277/725 [8:53:21<6:14:31, 50.16s/it]

([1, 2, 2], [1, 2, 2])
tensor(12.5484, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 38%|███▊      | 278/725 [8:53:50<5:27:18, 43.93s/it]

([2, 7, 1], [2, 7, 1])
tensor(7.0895, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 38%|███▊      | 279/725 [8:54:20<4:55:14, 39.72s/it]

([8, 7, 5], [8, 7, 5])
tensor(6.1673, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.2101925015449524), (499999, 0.0)]
([8, 7, 5], [8, 7, 5])
tensor(6.1403, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.09196662902832), (499999, 0.0)]
([8, 7, 5], [8, 7, 5])
tensor(4.5875, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.0330891609191895), (499999, 0.0)]
([8, 0, 5], [8, 7, 5])
tensor(3.2918, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.055456161499023), (499999, 0.0)]
([8, 0, 5], [8, 7, 5])
tensor(3.2211, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.307776927947998), (499999, 0.0)]
([8, 0, 5], [8, 7, 5])
tensor(3.2081, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.447144031524658), (499999, 0.0)]
([8, 0, 5], [8, 7, 5])
tensor(3.2018, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.570062637329102), (499999

 39%|███▊      | 280/725 [8:59:38<15:12:17, 123.00s/it]

test：0.0, test mean: 0.1978021978021978
([2, 8, 7], [2, 8, 7])
tensor(7.7884, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -1.5452096704393625e-05), (380927, 0.21019314229488373)]
([2, 8, 7], [2, 8, 7])
tensor(7.7595, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -3.090419340878725e-05), (380927, -9.09206771850586)]
([2, 8, 7], [2, 8, 7])
tensor(6.2026, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -4.635588265955448e-05), (380927, -4.898709297180176)]
([2, 8, 0], [2, 8, 7])
tensor(4.8353, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -6.18083868175745e-05), (380927, -4.8098063468933105)]
([2, 8, 0], [2, 8, 7])
tensor(4.8088, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -7.725972682237625e-05), (380927, -5.016939640045166)]
([2, 8, 0], [2, 8, 7])
tensor(4.8018, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -9.271129965782166e-05), (380927, -5

 39%|███▉      | 281/725 [9:05:22<23:21:27, 189.39s/it]

test：0.0, test mean: 0.1956521739130435
([2, 2, 2], [2, 2, 2])
tensor(12.3353, device='cuda:0', grad_fn=<NllLossBackward0>)


 39%|███▉      | 282/725 [9:05:52<17:25:43, 141.63s/it]

[(499999, 0.0), (499999, 0.0), (499999, 0.0)]
([5, 2, 2], [6, 2, 2])
tensor(6.7412, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 39%|███▉      | 283/725 [9:06:21<13:13:52, 107.77s/it]

([5, 7, 7], [5, 7, 7])
tensor(6.8147, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -3.8253605453064665e-05), (499999, 0.0), (380927, -0.3589414656162262)]
([5, 7, 7], [5, 7, 7])
tensor(7.1800, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -7.650721090612933e-05), (499999, 0.0), (380927, -1.3430659770965576)]
([5, 7, 7], [5, 7, 7])
tensor(5.2964, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.00011476081272121519), (499999, 0.0), (380927, 6.142195224761963)]
([5, 7, 7], [5, 7, 7])
tensor(3.4365, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.00015301442181225866), (499999, 0.0), (380927, 5.203095436096191)]
([5, 7, 0], [5, 7, 7])
tensor(3.1772, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.00019126804545521736), (499999, 0.0), (380927, 7.002910137176514)]
([5, 7, 0], [5, 7, 7])
tensor(3.1209, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.00022952159633859992), (499999, 0.0), (380927, 8.429134368896484)]
([5, 7, 0], [5, 7, 

 39%|███▉      | 284/725 [9:12:03<21:48:55, 178.09s/it]

test：0.0, test mean: 0.1935483870967742
([2, 7, 8], [2, 7, 8])
tensor(10.4346, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21012556552886963), (499999, 0.0)]
([2, 7, 8], [2, 7, 8])
tensor(10.2702, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.139566421508789), (499999, 0.0)]
([2, 7, 8], [2, 7, 8])
tensor(8.9125, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.2115552425384521), (499999, 0.0)]
([2, 0, 8], [2, 7, 8])
tensor(7.4010, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.6101075410842896), (499999, 0.0)]
([2, 0, 8], [2, 7, 8])
tensor(7.2783, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.1350467205047607), (499999, 0.0)]
([2, 0, 8], [2, 7, 8])
tensor(7.2635, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.8882937431335449), (499999, 0.0)]
([2, 0, 8], [2, 7, 8])
tensor(7.2564, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 

 39%|███▉      | 285/725 [9:17:27<27:07:13, 221.89s/it]

test：0.0, test mean: 0.19148936170212766
([2, 2, 7], [2, 2, 7])
tensor(14.0119, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.2102203369140625)]
([2, 2, 7], [2, 2, 7])
tensor(13.8219, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -8.995711326599121)]
([2, 2, 7], [2, 2, 7])
tensor(12.1244, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -3.118302583694458)]
([2, 2, 7], [2, 2, 7])
tensor(10.6033, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -1.630919337272644)]
([2, 2, 0], [2, 2, 7])
tensor(10.2878, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -2.2064175605773926)]
([2, 2, 0], [2, 2, 7])
tensor(10.2242, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -1.4521149396896362)]
([2, 2, 0], [2, 2, 7])
tensor(10.2047, device='cuda:0', grad_fn=<NllLossBackward0>)
[(4

 39%|███▉      | 286/725 [9:23:11<31:31:20, 258.50s/it]

test：0.0, test mean: 0.18947368421052632
([2, 0, 0], [2, 0, 0])
tensor(3.1819, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0001145025307778269), (499999, 0.0)]


 40%|███▉      | 287/725 [9:23:40<23:03:57, 189.58s/it]

([7, 0, 2], [7, 7, 2])
tensor(10.3644, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.22122976183891296), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 7, 2])
tensor(10.4033, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.601780891418457), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 7, 2])
tensor(8.7441, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.1108109951019287), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 7, 2])
tensor(7.1325, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.5525696277618408), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(6.7362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.0601072311401367), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(6.6697, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.7017934322357178), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(6.6519, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.3680602312088013), (499999, 0.0),

 40%|███▉      | 288/725 [9:28:58<27:41:50, 228.17s/it]

test：0.0, test mean: 0.1875
([1, 2, 3], [1, 2, 3])
tensor(7.5240, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 0.0027498379349708557)]
([1, 2, 3], [1, 2, 3])
tensor(7.1830, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 6.199255466461182)]
([1, 2, 3], [1, 2, 3])
tensor(7.1351, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 13.279478073120117)]
([1, 2, 0], [1, 2, 3])
tensor(6.4307, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 12.538196563720703)]
([1, 2, 0], [1, 2, 3])
tensor(6.4053, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 12.499938011169434)]
([1, 2, 0], [1, 2, 3])
tensor(6.4020, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 12.526800155639648)]
([1, 2, 0], [1, 2, 3])
tensor(6.3990, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999

 40%|███▉      | 289/725 [9:34:19<30:59:35, 255.91s/it]

test：0.0, test mean: 0.18556701030927836
([1, 2, 1], [1, 2, 1])
tensor(7.8827, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 40%|████      | 290/725 [9:34:49<22:43:38, 188.09s/it]

([8, 7, 1], [8, 0, 1])
tensor(5.1219, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 40%|████      | 291/725 [9:35:16<16:52:17, 139.95s/it]

([7, 6, 8], [7, 6, 0])
tensor(3.9043, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 40%|████      | 292/725 [9:35:46<12:51:43, 106.94s/it]

([1, 0, 1], [1, 0, 1])
tensor(5.3808, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -0.007091423496603966), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(5.3860, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -0.3857923746109009), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(5.3807, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -0.3969978392124176), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(5.3850, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -0.5891835689544678), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(5.3786, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -0.6016104221343994), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(5.3807, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -0.7750943899154663), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(5.3780, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -0.7846858501434326)

 40%|████      | 293/725 [9:40:16<18:40:57, 155.69s/it]

test：1.0, test mean: 0.19387755102040816
([0, 2, 7], [0, 2, 7])
tensor(7.0680, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.04980456829071045), (499999, 0.0), (380927, 0.21018444001674652)]
([0, 2, 7], [0, 2, 7])
tensor(7.0959, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.14892953634262085), (499999, 0.0), (380927, -9.9860258102417)]
([0, 2, 7], [0, 2, 7])
tensor(5.4007, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.3267841041088104), (499999, 0.0), (380927, -7.533359050750732)]
([0, 2, 0], [0, 2, 7])
tensor(4.0146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.5514967441558838), (499999, 0.0), (380927, -7.545135974884033)]
([0, 2, 0], [0, 2, 7])
tensor(3.8965, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.6386645436286926), (499999, 0.0), (380927, -7.665031909942627)]
([0, 2, 0], [0, 2, 7])
tensor(3.9178, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -1.8974684476852417), (499999, 0.0), (380927, -7.71811056137085)]

 41%|████      | 294/725 [9:45:00<23:15:29, 194.27s/it]

test：0.5, test mean: 0.19696969696969696
([3, 2, 1], [7, 2, 1])
tensor(6.4005, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 41%|████      | 295/725 [9:45:25<17:09:21, 143.63s/it]

([8, 3, 2], [8, 3, 2])
tensor(5.7670, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 41%|████      | 296/725 [9:45:49<12:49:52, 107.68s/it]

([6, 2, 5], [6, 2, 6])
tensor(5.4590, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.0029125965666025877)]


 41%|████      | 297/725 [9:46:13<9:49:41, 82.67s/it]  

([1, 1, 1], [1, 1, 1])
tensor(6.0236, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0014855768531560898), (499999, 0.0), (499999, 0.0)]


 41%|████      | 298/725 [9:46:38<7:44:45, 65.31s/it]

([8, 8, 7], [8, 8, 7])
tensor(7.4107, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.21022139489650726)]
([8, 8, 7], [8, 8, 7])
tensor(7.3321, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -10.878239631652832)]
([8, 8, 7], [8, 8, 7])
tensor(5.7076, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -5.907857418060303)]
([8, 8, 7], [8, 8, 7])
tensor(4.0922, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.503276824951172)]
([8, 8, 0], [8, 8, 7])
tensor(3.6641, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.124682426452637)]
([8, 8, 0], [8, 8, 7])
tensor(3.6192, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.095889091491699)]
([8, 8, 0], [8, 8, 7])
tensor(3.6007, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.23504447

 41%|████      | 299/725 [9:51:26<15:38:10, 132.14s/it]

test：0.0, test mean: 0.195
([1, 2, 5], [1, 2, 5])
tensor(5.7182, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 41%|████▏     | 300/725 [9:51:52<11:49:19, 100.14s/it]

([3, 5, 8], [5, 1, 8])
tensor(3.1197, device='cuda:0', grad_fn=<NllLossBackward0>)


 42%|████▏     | 301/725 [9:52:17<9:07:59, 77.55s/it]  

[(499999, 0.0), (499999, 0.0), (499999, 0.0)]
([1, 7, 3], [1, 7, 3])
tensor(4.3526, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 42%|████▏     | 302/725 [9:52:42<7:15:22, 61.76s/it]

([2, 2, 2], [2, 2, 2])
tensor(13.0828, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 42%|████▏     | 303/725 [9:53:07<5:57:42, 50.86s/it]

([5, 3, 1], [0, 3, 1])
tensor(2.7327, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.013641688972711563)]


 42%|████▏     | 304/725 [9:53:31<5:00:19, 42.80s/it]

([2, 6, 7], [2, 6, 7])
tensor(9.4324, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.22119398415088654)]
([2, 6, 7], [2, 6, 7])
tensor(9.5670, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -7.211884498596191)]
([2, 6, 7], [2, 6, 7])
tensor(7.9491, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -3.320665121078491)]
([2, 6, 0], [2, 6, 7])
tensor(6.5606, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -3.549140453338623)]
([2, 6, 0], [2, 6, 7])
tensor(6.5448, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -3.8339834213256836)]
([2, 6, 0], [2, 6, 7])
tensor(6.5391, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.019503593444824)]
([2, 6, 0], [2, 6, 7])
tensor(6.5352, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.16820335

 42%|████▏     | 305/725 [9:58:15<13:25:40, 115.10s/it]

test：0.0, test mean: 0.19306930693069307
([0, 7, 2], [0, 7, 2])
tensor(8.0373, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.01603703945875168), (380927, -0.3479323983192444), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(8.6086, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -1.0597293376922607), (380927, 0.3464279770851135), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(6.6373, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -1.2349649667739868), (380927, 6.304797649383545), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(4.8656, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -2.2974038124084473), (380927, 6.81464147567749), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(4.3619, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -2.34812331199646), (380927, 9.071083068847656), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(4.3223, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -2.3038320541381836), (380927, 10.09038257598877), (499999, 0.0)]
([0,

 42%|████▏     | 306/725 [10:02:42<18:43:09, 160.83s/it]

test：0.5, test mean: 0.19607843137254902
([6, 1, 1], [6, 1, 1])
tensor(4.6450, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 42%|████▏     | 307/725 [10:03:09<13:59:16, 120.47s/it]

([2, 2, 1], [2, 2, 1])
tensor(8.3600, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 42%|████▏     | 308/725 [10:03:33<10:37:29, 91.73s/it] 

([0, 5, 0], [0, 5, 0])
tensor(0.9097, device='cuda:0', grad_fn=<NllLossBackward0>)
[(417791, 0.0027539506554603577), (499999, 0.0), (393215, 0.0023297322914004326)]
([0, 5, 0], [0, 5, 0])
tensor(1.3550, device='cuda:0', grad_fn=<NllLossBackward0>)
[(417791, -7.960679054260254), (499999, 0.0), (393215, -1.3461767435073853)]
([0, 5, 0], [0, 5, 0])
tensor(0.9014, device='cuda:0', grad_fn=<NllLossBackward0>)
[(417791, -8.040706634521484), (499999, 0.0), (393215, -1.516660213470459)]
([0, 5, 0], [0, 5, 0])
tensor(1.2488, device='cuda:0', grad_fn=<NllLossBackward0>)
[(417791, -16.599061965942383), (499999, 0.0), (393215, -1.1391642093658447)]
([0, 5, 0], [0, 5, 0])
tensor(0.8856, device='cuda:0', grad_fn=<NllLossBackward0>)
[(417791, -16.665958404541016), (499999, 0.0), (393215, -1.099897861480713)]
([0, 5, 0], [0, 5, 0])
tensor(1.0901, device='cuda:0', grad_fn=<NllLossBackward0>)
[(417791, -25.041038513183594), (499999, 0.0), (393215, -0.8618787527084351)]
([0, 5, 0], [0, 5, 0])
tensor(0.88

 43%|████▎     | 309/725 [10:08:13<17:06:11, 148.01s/it]

test：1.0, test mean: 0.20388349514563106
([1, 7, 3], [1, 7, 3])
tensor(6.3292, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.3479250371456146), (454655, -0.8441708087921143)]
([1, 7, 3], [1, 7, 3])
tensor(6.6101, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.6590749025344849), (454655, -1.8896095752716064)]
([1, 7, 8], [1, 7, 3])
tensor(3.8631, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.5422016382217407), (454655, -2.2167599201202393)]
([1, 0, 0], [1, 7, 3])
tensor(1.8919, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.8510600328445435), (454655, -1.0152771472930908)]
([1, 0, 0], [1, 7, 3])
tensor(1.7928, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.1421563625335693), (454655, -0.7599728107452393)]
([1, 0, 0], [1, 7, 3])
tensor(1.7802, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.3172667026519775), (454655, -0.6185344457626

 43%|████▎     | 310/725 [10:12:43<21:18:08, 184.79s/it]

test：0.0, test mean: 0.20192307692307693
([8, 0, 0], [8, 0, 0])
tensor(1.2423, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.003742142813280225), (479231, -0.00428758654743433)]
([8, 0, 0], [8, 0, 0])
tensor(1.2907, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 0.0016052653081715107), (479231, 1.8443541526794434)]
([8, 0, 0], [8, 0, 0])
tensor(1.2386, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 0.0010504921665415168), (479231, 1.9599274396896362)]
([8, 0, 0], [8, 0, 0])
tensor(1.2747, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 0.003687445539981127), (479231, 2.2232675552368164)]
([8, 0, 0], [8, 0, 0])
tensor(1.2318, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 0.0007345464546233416), (479231, 2.280567169189453)]
([8, 0, 0], [8, 0, 0])
tensor(1.2411, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.0008298647589981556), (479231, 1.95777

 43%|████▎     | 311/725 [10:17:11<24:07:55, 209.84s/it]

test：1.0, test mean: 0.20952380952380953
([1, 2, 0], [1, 2, 1])
tensor(6.4853, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 43%|████▎     | 312/725 [10:17:37<17:44:44, 154.68s/it]

([2, 1, 0], [2, 1, 0])
tensor(4.6973, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.009031040593981743), (405503, 0.0195174440741539)]
([2, 1, 7], [2, 1, 0])
tensor(5.3670, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.018062081187963486), (405503, -9.895767211914062)]
([2, 1, 0], [2, 1, 0])
tensor(4.6601, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.027093123644590378), (405503, -9.925872802734375)]
([2, 1, 0], [2, 1, 0])
tensor(4.6599, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.03612416237592697), (405503, -9.955025672912598)]
([2, 1, 0], [2, 1, 0])
tensor(4.6597, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.045155201107263565), (405503, -9.984111785888672)]
([2, 1, 0], [2, 1, 0])
tensor(4.6596, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.054186247289180756), (405503, -10.01268196105957)]
([2, 1, 0], [2, 1, 0])
tensor(4.659

 43%|████▎     | 313/725 [10:22:25<22:16:21, 194.62s/it]

test：1.0, test mean: 0.2169811320754717
([0, 5, 5], [3, 5, 5])
tensor(2.8316, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.06228826195001602), (499999, 0.0), (499999, 0.0)]
([7, 5, 5], [3, 5, 5])
tensor(3.1822, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -1.1914851665496826), (499999, 0.0), (499999, 0.0)]
([0, 5, 5], [3, 5, 5])
tensor(2.6425, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 5.030116081237793), (499999, 0.0), (499999, 0.0)]
([0, 5, 5], [3, 5, 5])
tensor(2.5863, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 6.427517414093018), (499999, 0.0), (499999, 0.0)]
([0, 5, 5], [3, 5, 5])
tensor(2.4961, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 6.523484230041504), (499999, 0.0), (499999, 0.0)]
([0, 5, 5], [3, 5, 5])
tensor(2.4957, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 6.593175888061523), (499999, 0.0), (499999, 0.0)]
([0, 5, 5], [3, 5, 5])
tensor(2.4956, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 6.65

 43%|████▎     | 314/725 [10:26:51<24:39:23, 215.97s/it]

test：0.0, test mean: 0.21495327102803738
([2, 0, 2], [2, 0, 2])
tensor(9.6384, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 0.02494441531598568), (499999, 0.0)]
([2, 1, 2], [2, 0, 2])
tensor(9.9548, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -0.7191599607467651), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(9.6383, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -0.7531461715698242), (499999, 0.0)]
([2, 1, 2], [2, 0, 2])
tensor(9.9548, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -1.4972505569458008), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(9.6380, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -1.5308659076690674), (499999, 0.0)]
([2, 1, 2], [2, 0, 2])
tensor(9.9511, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -2.441450595855713), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(9.6374, device='cuda:0', grad_fn=<NllLossBackward0>)
[(49999

 43%|████▎     | 315/725 [10:31:22<26:28:06, 232.41s/it]

test：0.0, test mean: 0.21296296296296297
([8, 1, 1], [8, 1, 1])
tensor(6.6858, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 44%|████▎     | 316/725 [10:31:48<19:23:08, 170.63s/it]

([1, 1, 1], [1, 1, 1])
tensor(6.2310, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 44%|████▎     | 317/725 [10:32:15<14:27:31, 127.58s/it]

([3, 2, 2], [5, 2, 2])
tensor(9.7185, device='cuda:0', grad_fn=<NllLossBackward0>)
[(159743, -0.34748589992523193), (499999, 0.0), (499999, 0.0)]
([3, 2, 2], [5, 2, 2])
tensor(9.5731, device='cuda:0', grad_fn=<NllLossBackward0>)
[(159743, 3.6417908668518066), (499999, 0.0), (499999, 0.0)]
([3, 2, 2], [5, 2, 2])
tensor(9.4675, device='cuda:0', grad_fn=<NllLossBackward0>)
[(159743, 15.198789596557617), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [5, 2, 2])
tensor(8.9701, device='cuda:0', grad_fn=<NllLossBackward0>)
[(159743, 13.865928649902344), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [5, 2, 2])
tensor(8.9438, device='cuda:0', grad_fn=<NllLossBackward0>)
[(159743, 13.7138032913208), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [5, 2, 2])
tensor(8.9425, device='cuda:0', grad_fn=<NllLossBackward0>)
[(159743, 13.63518238067627), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [5, 2, 2])
tensor(8.9420, device='cuda:0', grad_fn=<NllLossBackward0>)
[(159743, 13.657293319702148), (499999, 0.0), (499999,

 44%|████▍     | 318/725 [10:36:40<19:03:47, 168.62s/it]

test：0.0, test mean: 0.21100917431192662
([8, 2, 1], [1, 2, 1])
tensor(8.9815, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 44%|████▍     | 319/725 [10:37:06<14:11:03, 125.77s/it]

([2, 7, 2], [2, 7, 2])
tensor(10.2894, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.2102210819721222), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(10.0954, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.797606468200684), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(8.4213, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.086902618408203), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(6.8516, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.22112131118774414), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(6.4980, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.6950027346611023), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(6.4268, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.039132833480835), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(6.4132, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.1249219179153442), (4

 44%|████▍     | 320/725 [10:41:37<19:03:55, 169.47s/it]

test：0.0, test mean: 0.20909090909090908
([2, 1, 6], [2, 1, 6])
tensor(10.5121, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 44%|████▍     | 321/725 [10:42:04<14:12:42, 126.64s/it]

([5, 2, 0], [6, 2, 0])
tensor(5.9348, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 3.621301584644243e-05)]


 44%|████▍     | 322/725 [10:42:28<10:45:08, 96.05s/it] 

([7, 2, 5], [7, 2, 5])
tensor(7.6792, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21022075414657593), (499999, 0.0), (491519, -0.2639002203941345)]
([7, 2, 8], [7, 2, 5])
tensor(7.0807, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.797561645507812), (499999, 0.0), (491519, 2.048466205596924)]
([7, 2, 0], [7, 2, 5])
tensor(4.9850, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.0856709480285645), (499999, 0.0), (491519, 2.0615334510803223)]
([7, 2, 0], [7, 2, 5])
tensor(3.4062, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.07236379384994507), (499999, 0.0), (491519, 2.0797805786132812)]
([0, 2, 0], [7, 2, 5])
tensor(3.0214, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.9696006774902344), (499999, 0.0), (491519, 2.026871681213379)]
([0, 2, 0], [7, 2, 5])
tensor(2.9703, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.1817930936813354), (499999, 0.0), (491519, 1.982020378112793)]
([0, 2, 0], [7, 2, 5])
tensor(2.9479, devic

 45%|████▍     | 323/725 [10:47:01<16:38:32, 149.03s/it]

test：0.0, test mean: 0.2072072072072072
([1, 0, 5], [1, 0, 0])
tensor(2.7916, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 45%|████▍     | 324/725 [10:47:27<12:29:17, 112.11s/it]

([5, 6, 1], [5, 6, 1])
tensor(5.6386, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 45%|████▍     | 325/725 [10:47:53<9:34:30, 86.18s/it]  

([8, 7, 1], [8, 7, 1])
tensor(6.5574, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.35891613364219666), (499999, 0.0)]
([8, 7, 1], [8, 7, 1])
tensor(6.9562, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.6801073551177979), (499999, 0.0)]
([8, 7, 1], [8, 7, 1])
tensor(5.2584, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.871626853942871), (499999, 0.0)]
([8, 0, 1], [8, 7, 1])
tensor(3.5349, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 5.9637603759765625), (499999, 0.0)]
([8, 0, 1], [8, 7, 1])
tensor(3.4870, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.145809173583984), (499999, 0.0)]
([8, 0, 1], [8, 7, 1])
tensor(3.4612, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.256089210510254), (499999, 0.0)]
([8, 0, 1], [8, 7, 1])
tensor(3.4511, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.250489711761475), (499999,

 45%|████▍     | 326/725 [10:52:21<15:36:26, 140.82s/it]

test：0.0, test mean: 0.20535714285714285
([5, 0, 8], [5, 0, 8])
tensor(2.7861, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.09272000193595886), (499999, 0.0)]
([5, 0, 8], [5, 0, 8])
tensor(2.9669, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.9895918965339661), (499999, 0.0)]
([5, 0, 8], [5, 0, 8])
tensor(2.7708, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 1.9182604551315308), (499999, 0.0)]
([5, 7, 8], [5, 0, 8])
tensor(3.2957, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 1.5561248064041138), (499999, 0.0)]
([5, 0, 8], [5, 0, 8])
tensor(2.7436, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 1.5560336112976074), (499999, 0.0)]
([5, 0, 8], [5, 0, 8])
tensor(2.7436, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 1.5559426546096802), (499999, 0.0)]
([5, 0, 8], [5, 0, 8])
tensor(2.7436, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0

 45%|████▌     | 327/725 [10:56:46<19:41:24, 178.10s/it]

test：1.0, test mean: 0.21238938053097345
([0, 1, 1], [0, 1, 1])
tensor(4.0691, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 45%|████▌     | 328/725 [10:57:11<14:34:21, 132.14s/it]

([1, 2, 2], [1, 2, 2])
tensor(11.2653, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 2.3054184566717595e-05)]


 45%|████▌     | 329/725 [10:57:36<10:59:20, 99.90s/it] 

([2, 1, 2], [2, 1, 2])
tensor(9.8586, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.008054624311625957), (499999, 0.0)]


 46%|████▌     | 330/725 [10:58:01<8:29:30, 77.39s/it] 

([7, 1, 1], [7, 1, 1])
tensor(6.9700, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.34793218970298767), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [7, 1, 1])
tensor(7.3131, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.659944772720337), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [7, 1, 1])
tensor(5.3840, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.5552260875701904), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [7, 1, 1])
tensor(3.4664, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.463606834411621), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(3.2914, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.570650100708008), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(3.2419, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.316178798675537), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(3.2324, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.824790954589844), (499999, 0.0), (4999

 46%|████▌     | 331/725 [11:02:26<14:38:11, 133.73s/it]

test：0.0, test mean: 0.21052631578947367
([7, 0, 2], [7, 0, 2])
tensor(5.5406, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.20992952585220337), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 0, 2])
tensor(5.5173, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.859113693237305), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 0, 2])
tensor(4.1888, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.5789194107055664), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(3.0758, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.600721836090088), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(3.0728, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.6412651538848877), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(3.0704, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.6769464015960693), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(3.0693, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927

 46%|████▌     | 332/725 [11:07:01<19:13:20, 176.08s/it]

test：0.0, test mean: 0.20869565217391303
([0, 6, 5], [0, 6, 5])
tensor(2.6496, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 1.3142942407284863e-05), (499999, 0.0), (499999, 0.0)]


 46%|████▌     | 333/725 [11:07:28<14:19:42, 131.59s/it]

([8, 2, 5], [8, 2, 3])
tensor(6.6597, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 46%|████▌     | 334/725 [11:07:58<10:57:07, 100.84s/it]

([1, 1, 2], [1, 1, 2])
tensor(8.8930, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.0004088293353561312)]


 46%|████▌     | 335/725 [11:08:23<8:29:26, 78.38s/it]  

([1, 8, 0], [1, 8, 0])
tensor(3.5740, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 46%|████▋     | 336/725 [11:08:52<6:50:28, 63.31s/it]

([2, 1, 7], [2, 1, 7])
tensor(8.0122, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.35893964767456055)]
([2, 1, 7], [2, 1, 7])
tensor(8.3429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 1.9161059856414795)]
([2, 1, 7], [2, 1, 7])
tensor(6.6574, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 10.653934478759766)]
([2, 1, 0], [2, 1, 7])
tensor(4.9638, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 9.892333030700684)]
([2, 1, 0], [2, 1, 7])
tensor(4.8936, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 9.25996208190918)]
([2, 1, 0], [2, 1, 7])
tensor(4.8812, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 8.880172729492188)]
([2, 1, 0], [2, 1, 7])
tensor(4.8752, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 8.6119308471679

 46%|████▋     | 337/725 [11:14:07<14:57:34, 138.80s/it]

test：0.0, test mean: 0.20689655172413793
([2, 0, 0], [2, 0, 0])
tensor(4.3038, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.0007948391139507294), (499999, 0.0)]
([2, 7, 0], [2, 0, 0])
tensor(4.6573, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -2.7800495624542236), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(4.2690, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -3.222308397293091), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(4.3068, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -4.007287979125977), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(4.2185, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -4.069364547729492), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(4.2157, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -4.148383617401123), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(4.2136, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999

 47%|████▋     | 338/725 [11:18:50<19:35:12, 182.20s/it]

test：1.0, test mean: 0.21367521367521367
([0, 1, 0], [0, 1, 7])
tensor(2.0367, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0014679154846817255), (499999, 0.0), (499999, 0.0)]


 47%|████▋     | 339/725 [11:19:19<14:36:58, 136.32s/it]

([8, 0, 2], [8, 1, 2])
tensor(7.9140, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 47%|████▋     | 340/725 [11:19:49<11:09:16, 104.30s/it]

([2, 1, 2], [2, 1, 2])
tensor(11.0130, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 47%|████▋     | 341/725 [11:20:17<8:42:07, 81.58s/it]  

([0, 2, 2], [0, 2, 2])
tensor(10.3002, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 47%|████▋     | 342/725 [11:20:41<6:50:24, 64.29s/it]

([0, 7, 6], [0, 7, 6])
tensor(4.7506, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21019253134727478), (499999, 0.0)]
([0, 7, 6], [0, 7, 6])
tensor(4.7525, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -8.813886642456055), (499999, 0.0)]
([0, 7, 6], [0, 7, 6])
tensor(3.1952, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.63690185546875), (499999, 0.0)]
([0, 0, 6], [0, 7, 6])
tensor(1.7920, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.523754119873047), (499999, 0.0)]
([0, 0, 6], [0, 7, 6])
tensor(1.7700, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.534038543701172), (499999, 0.0)]
([0, 0, 6], [0, 7, 6])
tensor(1.7660, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.532527446746826), (499999, 0.0)]
([0, 0, 6], [0, 7, 6])
tensor(1.7635, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.475948333740234), (499999

 47%|████▋     | 343/725 [11:25:40<14:15:59, 134.45s/it]

test：0.0, test mean: 0.211864406779661
([7, 2, 2], [7, 2, 2])
tensor(7.5941, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 47%|████▋     | 344/725 [11:26:05<10:45:19, 101.63s/it]

([2, 7, 1], [2, 7, 1])
tensor(7.4034, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.3479326367378235), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(7.7059, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.565205991268158), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(5.9416, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.519139289855957), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(3.9969, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 3.8385658264160156), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(3.7609, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 3.837027072906494), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(3.7149, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 3.1533451080322266), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(3.6882, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.5779013633728027), (499999,

 48%|████▊     | 345/725 [11:30:45<16:22:54, 155.20s/it]

test：0.0, test mean: 0.21008403361344538
([1, 2, 7], [1, 2, 7])
tensor(8.6714, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 48%|████▊     | 346/725 [11:31:09<12:11:28, 115.80s/it]

([1, 8, 1], [1, 8, 1])
tensor(4.9038, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.6188340783119202), (499999, 0.0), (499999, 0.0)]
([1, 8, 1], [1, 8, 1])
tensor(4.9164, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.9905052185058594), (499999, 0.0), (499999, 0.0)]
([1, 8, 1], [1, 8, 1])
tensor(4.8842, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -1.5124226808547974), (499999, 0.0), (499999, 0.0)]
([1, 8, 1], [1, 8, 1])
tensor(4.8543, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -3.6195735931396484), (499999, 0.0), (499999, 0.0)]
([1, 8, 1], [1, 8, 1])
tensor(4.8773, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -3.955831527709961), (499999, 0.0), (499999, 0.0)]
([1, 8, 1], [1, 8, 1])
tensor(4.8362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -4.580676555633545), (499999, 0.0), (499999, 0.0)]
([1, 8, 1], [1, 8, 1])
tensor(4.8142, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -5.367546558380127), (499999, 0.0), (49

 48%|████▊     | 347/725 [11:35:53<17:27:11, 166.22s/it]

test：1.0, test mean: 0.21666666666666667
([5, 7, 8], [7, 7, 8])
tensor(4.2500, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 0.20870010554790497), (499999, 0.0)]
([5, 7, 8], [7, 7, 8])
tensor(4.3670, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -11.393675804138184), (499999, 0.0)]
([5, 7, 8], [7, 7, 8])
tensor(2.9695, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -6.415450572967529), (499999, 0.0)]
([5, 0, 8], [7, 7, 8])
tensor(2.1474, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -6.5581464767456055), (499999, 0.0)]
([5, 0, 8], [7, 7, 8])
tensor(2.1461, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -6.673562049865723), (499999, 0.0)]
([5, 0, 8], [7, 7, 8])
tensor(2.1449, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -6.771337032318115), (499999, 0.0)]
([5, 0, 8], [7, 7, 8])
tensor(2.1449, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999,

 48%|████▊     | 348/725 [11:40:35<21:03:22, 201.07s/it]

test：0.0, test mean: 0.21487603305785125
([0, 2, 1], [5, 2, 1])
tensor(7.8874, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -0.27913522720336914), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(7.9789, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, 6.431872367858887), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(7.8109, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, 7.8165812492370605), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(7.9361, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, 11.12890338897705), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(7.7749, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, 11.13326644897461), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(7.7748, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, 11.136737823486328), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(7.7748, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, 11.

 48%|████▊     | 349/725 [11:45:20<23:37:35, 226.21s/it]

test：0.0, test mean: 0.21311475409836064
([1, 2, 2], [1, 2, 2])
tensor(8.3243, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.002906334586441517), (499999, 0.0), (499999, 0.0)]


 48%|████▊     | 350/725 [11:45:47<17:21:01, 166.56s/it]

([0, 7, 7], [0, 7, 7])
tensor(5.9348, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.003728835377842188), (368639, 0.20836883783340454), (380927, 0.2102205455303192)]
([0, 7, 7], [0, 7, 7])
tensor(5.8428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.4377278983592987), (368639, -8.639389991760254), (380927, -9.519414901733398)]
([0, 7, 7], [0, 7, 7])
tensor(2.6456, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.44579264521598816), (368639, -7.159819602966309), (380927, -4.02519416809082)]
([0, 0, 7], [0, 7, 7])
tensor(0.6117, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.6452412605285645), (368639, -6.95913028717041), (380927, -1.368409514427185)]
([0, 0, 0], [0, 7, 7])
tensor(0.1919, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.6420092582702637), (368639, -6.835816383361816), (380927, -1.5369340181350708)]
([0, 0, 0], [0, 7, 7])
tensor(0.1170, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.6418008208274841), (368639

 48%|████▊     | 351/725 [11:50:59<21:50:11, 210.19s/it]

test：0.3333333333333333, test mean: 0.2140921409214092
([1, 0, 2], [1, 0, 2])
tensor(8.3845, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 49%|████▊     | 352/725 [11:51:29<16:10:24, 156.10s/it]

([1, 2, 2], [1, 2, 2])
tensor(9.6103, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 49%|████▊     | 353/725 [11:51:57<12:09:24, 117.65s/it]

([6, 2, 8], [6, 2, 8])
tensor(7.4120, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 49%|████▉     | 354/725 [11:52:25<9:21:17, 90.78s/it]  

([6, 2, 2], [6, 2, 2])
tensor(11.8744, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 49%|████▉     | 355/725 [11:52:54<7:25:35, 72.26s/it]

([8, 2, 1], [8, 2, 1])
tensor(5.9906, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 49%|████▉     | 356/725 [11:53:19<5:57:32, 58.14s/it]

([1, 2, 7], [1, 2, 7])
tensor(8.6777, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.210194930434227)]
([1, 2, 7], [1, 2, 7])
tensor(8.8068, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -8.658672332763672)]
([1, 2, 7], [1, 2, 7])
tensor(7.2875, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.2445807456970215)]
([1, 2, 0], [1, 2, 7])
tensor(5.7524, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.388641357421875)]
([1, 2, 0], [1, 2, 7])
tensor(5.7043, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.447589874267578)]
([1, 2, 0], [1, 2, 7])
tensor(5.6965, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.505563735961914)]
([1, 2, 0], [1, 2, 7])
tensor(5.6943, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.5096807479

 49%|████▉     | 357/725 [11:58:18<13:19:41, 130.38s/it]

test：0.0, test mean: 0.21236559139784944
([7, 5, 8], [7, 3, 8])
tensor(5.4105, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.358941912651062), (499999, 0.0), (499999, 0.0)]
([7, 5, 8], [7, 3, 8])
tensor(5.7177, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.3430662155151367), (499999, 0.0), (499999, 0.0)]
([7, 5, 8], [7, 3, 8])
tensor(3.9031, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 6.143567085266113), (499999, 0.0), (499999, 0.0)]
([7, 5, 8], [7, 3, 8])
tensor(2.0280, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.155037879943848), (499999, 0.0), (499999, 0.0)]
([0, 5, 8], [7, 3, 8])
tensor(1.7342, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 7.825559616088867), (499999, 0.0), (499999, 0.0)]
([0, 5, 8], [7, 3, 8])
tensor(1.6750, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 9.542673110961914), (499999, 0.0), (499999, 0.0)]
([0, 5, 8], [7, 3, 8])
tensor(1.6509, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 10.14

 49%|████▉     | 358/725 [12:02:56<17:48:46, 174.73s/it]

test：0.0, test mean: 0.21066666666666667
([2, 7, 2], [2, 4, 2])
tensor(10.5524, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 50%|████▉     | 359/725 [12:03:27<13:22:02, 131.48s/it]

([2, 7, 7], [2, 7, 7])
tensor(10.6574, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.22118140757083893), (380927, 0.2102208137512207)]
([2, 7, 7], [2, 7, 7])
tensor(10.5567, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -6.5108442306518555), (380927, -9.797578811645508)]
([2, 7, 7], [2, 7, 7])
tensor(7.2891, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.60699462890625), (380927, -4.089019298553467)]
([2, 0, 7], [2, 7, 7])
tensor(4.2758, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.820326328277588), (380927, 0.18599677085876465)]
([2, 0, 0], [2, 7, 7])
tensor(3.8576, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.965710163116455), (380927, -0.7103869915008545)]
([2, 0, 0], [2, 7, 7])
tensor(3.7819, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.0838122367858887), (380927, -1.0239548683166504)]
([2, 0, 0], [2, 7, 7])
tensor(3.7649, d

 50%|████▉     | 360/725 [12:08:34<18:39:42, 184.06s/it]

test：0.0, test mean: 0.20899470899470898
([1, 2, 7], [1, 2, 7])
tensor(6.3527, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 50%|████▉     | 361/725 [12:09:00<13:50:15, 136.85s/it]

([2, 0, 1], [2, 0, 1])
tensor(5.6056, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 50%|████▉     | 362/725 [12:09:26<10:26:30, 103.56s/it]

([2, 8, 1], [2, 8, 1])
tensor(8.9867, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 50%|█████     | 363/725 [12:09:52<8:03:30, 80.14s/it]  

([7, 2, 7], [7, 2, 7])
tensor(10.3532, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.209797203540802), (499999, 0.0), (380927, 0.21018260717391968)]
([7, 2, 7], [7, 2, 7])
tensor(10.4367, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.427309513092041), (499999, 0.0), (380927, -9.092065811157227)]
([7, 2, 7], [7, 2, 7])
tensor(7.5840, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.8306646347045898), (499999, 0.0), (380927, -4.019266128540039)]
([0, 2, 0], [7, 2, 7])
tensor(5.1669, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.8815240859985352), (499999, 0.0), (380927, -3.8783671855926514)]
([0, 2, 0], [7, 2, 7])
tensor(5.0795, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.9186561107635498), (499999, 0.0), (380927, -4.246049404144287)]
([0, 2, 0], [7, 2, 7])
tensor(5.0573, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.9071260690689087), (499999, 0.0), (380927, -4.575871467590332)]
([0, 2, 0], [7, 2, 7])
tensor(5.0416, devic

 50%|█████     | 364/725 [12:14:57<14:48:21, 147.65s/it]

test：0.0, test mean: 0.20734908136482938
([2, 5, 2], [2, 5, 2])
tensor(9.7591, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0012476826086640358), (499999, 0.0)]


 50%|█████     | 365/725 [12:15:23<11:07:46, 111.29s/it]

([7, 2, 0], [7, 2, 0])
tensor(7.6899, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21000058948993683), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(7.6583, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.220324516296387), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(6.2990, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.391488552093506), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(5.0924, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.346743583679199), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(5.0894, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.317600727081299), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(5.0874, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.289721488952637), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(5.0861, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.265408039093018), (499999, 0.0), (49999

 50%|█████     | 366/725 [12:19:58<15:58:08, 160.13s/it]

test：0.0, test mean: 0.20572916666666669
([8, 2, 5], [8, 2, 5])
tensor(6.1043, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 51%|█████     | 367/725 [12:20:23<11:54:50, 119.81s/it]

([1, 2, 7], [1, 2, 7])
tensor(8.8846, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 0.20696350932121277)]
([1, 2, 7], [1, 2, 7])
tensor(8.7628, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -11.151535987854004)]
([1, 2, 7], [1, 2, 7])
tensor(7.2509, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -7.304470062255859)]
([1, 2, 0], [1, 2, 7])
tensor(6.7745, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -7.367090225219727)]
([1, 2, 0], [1, 2, 7])
tensor(6.7732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -7.420875549316406)]
([1, 2, 0], [1, 2, 7])
tensor(6.7721, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -7.466253280639648)]
([1, 2, 0], [1, 2, 7])
tensor(6.7719, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -7.50628519

 51%|█████     | 368/725 [12:25:26<17:19:39, 174.73s/it]

test：0.0, test mean: 0.2041343669250646
([8, 3, 2], [8, 3, 2])
tensor(5.6255, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.3552554249763489), (499999, 0.0), (499999, 0.0)]
([7, 3, 2], [8, 3, 2])
tensor(5.7713, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 4.9181413650512695), (499999, 0.0), (499999, 0.0)]
([0, 3, 2], [8, 3, 2])
tensor(5.0516, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 5.035492897033691), (499999, 0.0), (499999, 0.0)]
([0, 3, 2], [8, 3, 2])
tensor(5.0413, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 5.080399513244629), (499999, 0.0), (499999, 0.0)]
([0, 3, 2], [8, 3, 2])
tensor(5.0400, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 5.1199188232421875), (499999, 0.0), (499999, 0.0)]
([0, 3, 2], [8, 3, 2])
tensor(5.0393, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 5.151701927185059), (499999, 0.0), (499999, 0.0)]
([0, 3, 2], [8, 3, 2])
tensor(5.0385, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 5.1793

 51%|█████     | 369/725 [12:30:13<20:36:39, 208.42s/it]

test：0.0, test mean: 0.20256410256410257
([7, 5, 7], [7, 5, 7])
tensor(5.8963, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21009838581085205), (499999, 0.0), (380927, 0.20912925899028778)]
([7, 5, 7], [7, 5, 7])
tensor(5.4964, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.346307754516602), (499999, 0.0), (380927, -3.445448398590088)]
([7, 5, 7], [7, 5, 7])
tensor(3.3310, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.1118532419204712), (499999, 0.0), (380927, 5.5667195320129395)]
([0, 5, 0], [7, 5, 7])
tensor(1.0913, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.5089664459228516), (499999, 0.0), (380927, 5.7911882400512695)]
([0, 5, 0], [7, 5, 7])
tensor(1.0475, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.7907742261886597), (499999, 0.0), (380927, 5.9163498878479)]
([0, 5, 0], [7, 5, 7])
tensor(1.0339, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.9769271612167358), (499999, 0.0), (380927, 6.027829647064209)]
([0

 51%|█████     | 370/725 [12:35:03<22:57:47, 232.87s/it]

test：0.0, test mean: 0.2010178117048346
([0, 2, 1], [0, 2, 1])
tensor(4.8953, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.008922009728848934), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [0, 2, 1])
tensor(4.8953, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.017821498215198517), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [0, 2, 1])
tensor(4.8953, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.026720985770225525), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [0, 2, 1])
tensor(4.8953, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.03562047332525253), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [0, 2, 1])
tensor(4.8953, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.04451996088027954), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [0, 2, 1])
tensor(4.8953, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.05341944843530655), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [0, 2, 1])
tensor(4.8953, device='cuda:0', grad_fn=<NllLossBackward0>

 51%|█████     | 371/725 [12:39:35<24:02:06, 244.43s/it]

test：1.0, test mean: 0.20707070707070707
([3, 0, 5], [3, 7, 8])
tensor(1.9773, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -0.8460078239440918), (499999, 0.0), (499999, 0.0)]
([3, 0, 5], [3, 7, 8])
tensor(1.8763, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -1.8083972930908203), (499999, 0.0), (499999, 0.0)]
([0, 0, 5], [3, 7, 8])
tensor(1.0031, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -2.1512837409973145), (499999, 0.0), (499999, 0.0)]
([0, 0, 5], [3, 7, 8])
tensor(0.9305, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -2.0953612327575684), (499999, 0.0), (499999, 0.0)]
([0, 0, 5], [3, 7, 8])
tensor(0.9117, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -2.1023311614990234), (499999, 0.0), (499999, 0.0)]
([0, 0, 5], [3, 7, 8])
tensor(0.9047, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -2.1684205532073975), (499999, 0.0), (499999, 0.0)]
([0, 0, 5], [3, 7, 8])
tensor(0.8986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(2334

 51%|█████▏    | 372/725 [12:44:07<24:46:55, 252.74s/it]

test：0.0, test mean: 0.20551378446115287
([8, 2, 7], [8, 2, 7])
tensor(9.6516, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.022890496999025345), (499999, 0.0), (380927, -0.35890501737594604)]
([8, 2, 7], [8, 2, 7])
tensor(10.0335, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.04578099399805069), (499999, 0.0), (380927, 2.579481363296509)]
([8, 2, 7], [8, 2, 7])
tensor(8.5868, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.06867150962352753), (499999, 0.0), (380927, 8.433939933776855)]
([8, 2, 0], [8, 2, 7])
tensor(6.9826, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.09156198799610138), (499999, 0.0), (380927, 8.438142776489258)]
([8, 2, 0], [8, 2, 7])
tensor(6.9802, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.1144525408744812), (499999, 0.0), (380927, 8.444393157958984)]
([8, 2, 0], [8, 2, 7])
tensor(6.9786, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.13734297454357147), (499999, 0.0), (380927, 8.4379501342773

 51%|█████▏    | 373/725 [12:48:55<25:45:48, 263.49s/it]

test：0.0, test mean: 0.20398009950248755
([5, 2, 2], [8, 2, 2])
tensor(9.1596, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 52%|█████▏    | 374/725 [12:49:20<18:42:37, 191.90s/it]

([2, 0, 1], [2, 7, 1])
tensor(6.2753, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 52%|█████▏    | 375/725 [12:49:44<13:45:22, 141.49s/it]

([0, 2, 1], [1, 2, 1])
tensor(6.7451, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 52%|█████▏    | 376/725 [12:50:06<10:14:38, 105.67s/it]

([3, 1, 2], [3, 1, 2])
tensor(7.1912, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, -0.8064886331558228), (499999, 0.0), (499999, 0.00038114842027425766)]
([3, 1, 2], [3, 1, 2])
tensor(7.1625, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, -5.810754776000977), (499999, 0.0), (499999, 0.0007622968405485153)]
([3, 1, 2], [3, 1, 2])
tensor(6.4980, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, -3.597550392150879), (499999, 0.0), (499999, 0.0011434454936534166)]
([0, 1, 2], [3, 1, 2])
tensor(6.1658, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, -3.314894437789917), (499999, 0.0), (499999, 0.0015245936810970306)]
([0, 1, 2], [3, 1, 2])
tensor(6.1503, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, -2.7323551177978516), (499999, 0.0), (499999, 0.001905742334201932)]
([0, 1, 2], [3, 1, 2])
tensor(6.1148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, -2.211696147918701), (499999, 0.0), (499999, 0.002286889823153615)]
([0, 1, 2], [3, 1, 2])
tens

 52%|█████▏    | 377/725 [12:54:35<14:56:23, 154.55s/it]

test：0.0, test mean: 0.2024691358024691
([1, 2, 1], [1, 2, 1])
tensor(8.4392, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 52%|█████▏    | 378/725 [12:54:57<11:04:26, 114.89s/it]

([8, 8, 1], [8, 8, 1])
tensor(4.2615, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.5622351169586182), (499999, 0.0)]
([8, 8, 1], [8, 8, 1])
tensor(4.4279, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 4.976137161254883), (499999, 0.0)]
([8, 0, 1], [8, 8, 1])
tensor(3.4354, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 4.600195407867432), (499999, 0.0)]
([8, 0, 1], [8, 8, 1])
tensor(3.4268, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 4.288094520568848), (499999, 0.0)]
([8, 0, 1], [8, 8, 1])
tensor(3.4227, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 4.115870475769043), (499999, 0.0)]
([8, 0, 1], [8, 8, 1])
tensor(3.4203, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 4.0752458572387695), (499999, 0.0)]
([8, 0, 1], [8, 8, 1])
tensor(3.4195, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 4.014261245727539), (499999, 0.0

 52%|█████▏    | 379/725 [12:59:25<15:27:20, 160.81s/it]

test：0.0, test mean: 0.20098039215686272
([3, 1, 2], [7, 1, 2])
tensor(5.8605, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 52%|█████▏    | 380/725 [12:59:50<11:30:02, 120.01s/it]

([1, 3, 2], [1, 4, 2])
tensor(6.8395, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 53%|█████▎    | 381/725 [13:00:18<8:49:24, 92.34s/it]  

([2, 8, 2], [2, 8, 2])
tensor(12.7529, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 53%|█████▎    | 382/725 [13:00:42<6:52:19, 72.13s/it]

([3, 1, 1], [3, 1, 1])
tensor(5.5360, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, -0.8058122992515564), (499999, 0.0), (499999, 0.0)]
([3, 1, 1], [3, 1, 1])
tensor(5.4817, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, -3.920748710632324), (499999, 0.0), (499999, 0.0)]
([3, 1, 1], [3, 1, 1])
tensor(4.5362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 0.6643745303153992), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [3, 1, 1])
tensor(4.0793, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 1.0548076629638672), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [3, 1, 1])
tensor(4.0231, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 1.5902860164642334), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [3, 1, 1])
tensor(4.0069, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 1.9931199550628662), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [3, 1, 1])
tensor(3.9947, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 2.1749396324157715), (499999, 0.0), (49999

 53%|█████▎    | 383/725 [13:05:12<12:29:16, 131.45s/it]

test：0.0, test mean: 0.1995133819951338
([2, 0, 6], [2, 0, 6])
tensor(4.7336, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 53%|█████▎    | 384/725 [13:05:36<9:22:49, 99.03s/it]  

([0, 2, 3], [0, 2, 4])
tensor(5.1341, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 53%|█████▎    | 385/725 [13:05:58<7:10:32, 75.98s/it]

([1, 1, 3], [1, 1, 3])
tensor(5.4461, device='cuda:0', grad_fn=<NllLossBackward0>)


 53%|█████▎    | 386/725 [13:06:23<5:43:32, 60.81s/it]

[(499999, 0.0), (499999, 0.0), (499999, 0.0)]
([2, 8, 2], [2, 8, 2])
tensor(9.6222, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 53%|█████▎    | 387/725 [13:06:48<4:40:57, 49.87s/it]

([3, 5, 1], [3, 3, 1])
tensor(5.4761, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0001267482730327174), (499999, -0.010026199743151665), (499999, 0.0)]


 54%|█████▎    | 388/725 [13:07:13<3:59:00, 42.55s/it]

([2, 2, 1], [2, 2, 1])
tensor(9.1168, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 54%|█████▎    | 389/725 [13:07:39<3:30:19, 37.56s/it]

([0, 2, 2], [0, 2, 2])
tensor(8.1713, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 54%|█████▍    | 390/725 [13:08:03<3:07:32, 33.59s/it]

([6, 1, 8], [6, 1, 8])
tensor(5.4631, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 54%|█████▍    | 391/725 [13:08:29<2:53:43, 31.21s/it]

([0, 0, 8], [8, 0, 8])
tensor(1.9200, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 54%|█████▍    | 392/725 [13:08:54<2:42:00, 29.19s/it]

([1, 0, 0], [1, 0, 0])
tensor(2.8295, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 54%|█████▍    | 393/725 [13:09:19<2:35:12, 28.05s/it]

([2, 0, 1], [2, 0, 1])
tensor(5.6526, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 54%|█████▍    | 394/725 [13:09:45<2:32:07, 27.57s/it]

([2, 2, 7], [2, 2, 7])
tensor(10.3692, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.21020494401454926)]
([2, 2, 7], [2, 2, 7])
tensor(10.4072, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -9.0734224319458)]
([2, 2, 7], [2, 2, 7])
tensor(8.8521, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.776402950286865)]
([2, 2, 0], [2, 2, 7])
tensor(7.2866, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.157309532165527)]
([2, 2, 0], [2, 2, 7])
tensor(7.2105, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -3.927445650100708)]
([2, 2, 0], [2, 2, 7])
tensor(7.2068, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -3.9300317764282227)]
([2, 2, 0], [2, 2, 7])
tensor(7.2044, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -3.89909696

 54%|█████▍    | 395/725 [13:14:40<9:52:26, 107.72s/it]

test：0.0, test mean: 0.19806763285024154
([1, 0, 5], [1, 0, 8])
tensor(2.8505, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 55%|█████▍    | 396/725 [13:15:06<7:36:02, 83.17s/it] 

([3, 0, 8], [3, 0, 8])
tensor(3.2708, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 55%|█████▍    | 397/725 [13:15:31<5:58:56, 65.66s/it]

([0, 2, 2], [0, 2, 2])
tensor(9.2264, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 55%|█████▍    | 398/725 [13:15:56<4:50:59, 53.39s/it]

([2, 7, 0], [2, 7, 0])
tensor(8.4814, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21018782258033752), (499999, 0.0)]
([2, 7, 0], [2, 7, 0])
tensor(8.4832, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -8.257783889770508), (499999, 0.0)]
([2, 7, 0], [2, 7, 0])
tensor(6.8940, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.9816009998321533), (499999, 0.0)]
([2, 0, 0], [2, 7, 0])
tensor(5.5318, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.131777286529541), (499999, 0.0)]
([2, 0, 0], [2, 7, 0])
tensor(5.5151, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.333868980407715), (499999, 0.0)]
([2, 0, 0], [2, 7, 0])
tensor(5.5106, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.458511829376221), (499999, 0.0)]
([2, 0, 0], [2, 7, 0])
tensor(5.5062, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.54181432723999), (49999

 55%|█████▌    | 399/725 [13:20:25<10:42:26, 118.24s/it]

test：0.0, test mean: 0.19664268585131894
([8, 5, 2], [8, 8, 2])
tensor(7.3869, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 55%|█████▌    | 400/725 [13:20:50<8:08:25, 90.17s/it]  

([2, 5, 2], [2, 8, 2])
tensor(8.5331, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 55%|█████▌    | 401/725 [13:21:16<6:22:44, 70.88s/it]

([1, 7, 6], [1, 7, 6])
tensor(6.9279, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.20999811589717865), (499999, 0.0)]
([1, 7, 6], [1, 7, 6])
tensor(6.9759, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -7.863246917724609), (499999, 0.0)]
([1, 7, 6], [1, 7, 6])
tensor(5.6313, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.4809257984161377), (499999, 0.0)]
([1, 0, 6], [1, 7, 6])
tensor(4.3229, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.4164016246795654), (499999, 0.0)]
([1, 0, 6], [1, 7, 6])
tensor(4.3153, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.413110613822937), (499999, 0.0)]
([1, 0, 6], [1, 7, 6])
tensor(4.3110, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.5023510456085205), (499999, 0.0)]
([1, 0, 6], [1, 7, 6])
tensor(4.3089, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.563069224357605), (49

 55%|█████▌    | 402/725 [13:26:12<12:25:32, 138.49s/it]

test：0.0, test mean: 0.19523809523809524
([2, 1, 7], [2, 1, 7])
tensor(8.4460, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 0.20670683681964874)]
([2, 1, 7], [2, 1, 7])
tensor(8.3229, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -12.544157028198242)]
([2, 1, 7], [2, 1, 7])
tensor(6.7980, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -10.04748821258545)]
([2, 1, 0], [2, 1, 7])
tensor(6.2730, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -10.123933792114258)]
([2, 1, 0], [2, 1, 7])
tensor(6.2727, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -10.198262214660645)]
([2, 1, 0], [2, 1, 7])
tensor(6.2723, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -10.27178955078125)]
([2, 1, 0], [2, 1, 7])
tensor(6.2717, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999

 56%|█████▌    | 403/725 [13:31:20<16:56:50, 189.47s/it]

test：0.0, test mean: 0.1938534278959811
([1, 8, 2], [1, 8, 2])
tensor(6.9180, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 56%|█████▌    | 404/725 [13:31:46<12:30:34, 140.29s/it]

([1, 1, 2], [1, 1, 2])
tensor(7.9066, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.003662622533738613)]


 56%|█████▌    | 405/725 [13:32:10<9:22:10, 105.41s/it] 

([7, 8, 5], [4, 8, 8])
tensor(2.7620, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 56%|█████▌    | 406/725 [13:32:36<7:14:12, 81.67s/it] 

([0, 8, 2], [0, 8, 2])
tensor(5.9734, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.003662576200440526)]


 56%|█████▌    | 407/725 [13:33:02<5:44:15, 64.95s/it]

([1, 2, 0], [1, 2, 0])
tensor(5.9859, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 56%|█████▋    | 408/725 [13:33:30<4:43:44, 53.71s/it]

([2, 1, 5], [2, 1, 5])
tensor(5.4621, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 56%|█████▋    | 409/725 [13:33:56<4:00:08, 45.60s/it]

([7, 5, 5], [7, 2, 5])
tensor(6.6610, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21022117137908936), (499999, 0.0), (499999, 0.0)]
([7, 5, 5], [7, 2, 5])
tensor(6.4531, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.797614097595215), (499999, 0.0), (499999, 0.0)]
([7, 5, 5], [7, 2, 5])
tensor(4.7926, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.085722923278809), (499999, 0.0), (499999, 0.0)]
([7, 5, 5], [7, 2, 5])
tensor(3.2349, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.4130607843399048), (499999, 0.0), (499999, 0.0)]
([0, 5, 5], [7, 2, 5])
tensor(2.8234, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.9216880798339844), (499999, 0.0), (499999, 0.0)]
([0, 5, 5], [7, 2, 5])
tensor(2.7817, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.156350612640381), (499999, 0.0), (499999, 0.0)]
([0, 5, 5], [7, 2, 5])
tensor(2.7623, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.356137275695801), (499999, 0.0), (499

 57%|█████▋    | 410/725 [13:38:42<10:17:04, 117.54s/it]

test：0.0, test mean: 0.19248826291079812
([3, 2, 7], [3, 2, 7])
tensor(8.6016, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.019420698285102844), (380927, 0.2096850574016571)]
([3, 2, 7], [3, 2, 7])
tensor(8.5442, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.03884139657020569), (380927, -3.041455030441284)]
([3, 2, 7], [3, 2, 7])
tensor(7.2984, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.05826208367943764), (380927, 4.418546676635742)]
([3, 2, 0], [3, 2, 7])
tensor(6.2380, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.07768279314041138), (380927, 4.804304122924805)]
([3, 2, 0], [3, 2, 7])
tensor(6.2260, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.09710348397493362), (380927, 5.050455570220947)]
([3, 2, 0], [3, 2, 7])
tensor(6.2241, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.11652418971061707), (380927, 5.274017333984375)]
([

 57%|█████▋    | 411/725 [13:43:38<14:56:26, 171.29s/it]

test：0.0, test mean: 0.19114219114219114
([2, 2, 5], [2, 2, 5])
tensor(9.3791, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 57%|█████▋    | 412/725 [13:44:03<11:04:08, 127.31s/it]

([3, 3, 8], [1, 3, 5])
tensor(2.8381, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 57%|█████▋    | 413/725 [13:44:30<8:24:46, 97.07s/it]  

([7, 0, 2], [7, 7, 2])
tensor(5.6404, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21019919216632843), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 7, 2])
tensor(5.6687, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.092445373535156), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 7, 2])
tensor(4.1942, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.873831748962402), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(2.6653, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.992379188537598), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(2.6063, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.143834590911865), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(2.6000, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.168493270874023), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(2.5989, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.179717063903809), (499999, 0.0), (49999

 57%|█████▋    | 414/725 [13:48:57<12:47:56, 148.16s/it]

test：0.0, test mean: 0.1898148148148148
([2, 5, 2], [2, 5, 2])
tensor(10.3831, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 57%|█████▋    | 415/725 [13:49:22<9:35:01, 111.30s/it] 

([2, 0, 2], [2, 1, 2])
tensor(10.3502, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 57%|█████▋    | 416/725 [13:49:49<7:22:16, 85.88s/it] 

([2, 5, 2], [2, 8, 2])
tensor(6.1803, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 58%|█████▊    | 417/725 [13:50:14<5:47:23, 67.67s/it]

([3, 7, 7], [3, 7, 7])
tensor(8.1828, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21018628776073456), (380927, -0.3589419722557068)]
([3, 7, 7], [3, 7, 7])
tensor(8.4672, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.091777801513672), (380927, -1.3430649042129517)]
([3, 7, 7], [3, 7, 7])
tensor(5.0679, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.040359020233154), (380927, 6.142421722412109)]
([3, 0, 7], [3, 7, 7])
tensor(1.9258, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.9016292095184326), (380927, 5.202284812927246)]
([3, 0, 0], [3, 7, 7])
tensor(1.6006, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.110674858093262), (380927, 7.287539958953857)]
([3, 0, 0], [3, 7, 7])
tensor(1.5331, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.210945129394531), (380927, 8.849128723144531)]
([3, 0, 0], [3, 7, 7])
tensor(1.5073, device='

 58%|█████▊    | 418/725 [13:55:20<11:52:25, 139.24s/it]

test：0.0, test mean: 0.18850574712643678
([0, 2, 0], [0, 2, 0])
tensor(6.7909, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 58%|█████▊    | 419/725 [13:55:46<8:56:48, 105.26s/it] 

([7, 2, 7], [7, 2, 7])
tensor(11.0039, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21018639206886292), (499999, 0.0), (380927, 0.21022075414657593)]
([7, 2, 7], [7, 2, 7])
tensor(10.8278, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.091885566711426), (499999, 0.0), (380927, -9.519343376159668)]
([7, 2, 7], [7, 2, 7])
tensor(7.6001, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.905927658081055), (499999, 0.0), (380927, -4.029472351074219)]
([0, 2, 7], [7, 2, 7])
tensor(4.6628, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.817892074584961), (499999, 0.0), (380927, -1.768984317779541)]
([0, 2, 0], [7, 2, 7])
tensor(4.3231, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.035215854644775), (499999, 0.0), (380927, -2.024782180786133)]
([0, 2, 0], [7, 2, 7])
tensor(4.2581, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.107849597930908), (499999, 0.0), (380927, -1.1533374786376953)]
([0, 2, 0], [7, 2, 7])
tensor(4.2353, dev

 58%|█████▊    | 420/725 [14:00:44<13:49:25, 163.17s/it]

test：0.0, test mean: 0.1872146118721461
([7, 2, 1], [7, 2, 1])
tensor(7.4583, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 58%|█████▊    | 421/725 [14:01:11<10:19:43, 122.31s/it]

([0, 1, 2], [0, 1, 2])
tensor(8.9896, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 58%|█████▊    | 422/725 [14:01:38<7:53:18, 93.72s/it]  

([2, 8, 7], [2, 8, 7])
tensor(7.7568, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0005885895807296038), (499999, 0.0), (380927, 0.2098156362771988)]
([2, 8, 7], [2, 8, 7])
tensor(7.6982, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0011771791614592075), (499999, 0.0), (380927, -4.429378032684326)]
([2, 8, 7], [2, 8, 7])
tensor(6.3858, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0017657687421888113), (499999, 0.0), (380927, 1.875401496887207)]
([2, 8, 0], [2, 8, 7])
tensor(5.2636, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.002354358322918415), (499999, 0.0), (380927, 1.8988202810287476)]
([2, 8, 0], [2, 8, 7])
tensor(5.2606, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.002942947670817375), (499999, 0.0), (380927, 1.9084076881408691)]
([2, 8, 0], [2, 8, 7])
tensor(5.2591, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0035315374843776226), (499999, 0.0), (380927, 1.9182312488555908)]
([2, 8, 0], [2, 8, 7])
tensor(5

 58%|█████▊    | 423/725 [14:06:50<13:20:42, 159.08s/it]

test：0.0, test mean: 0.18594104308390022
([2, 6, 0], [2, 6, 0])
tensor(6.1477, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 0.0001836444716900587)]
([2, 6, 0], [2, 6, 0])
tensor(6.2008, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 1.3367435932159424)]
([2, 6, 0], [2, 6, 0])
tensor(6.1396, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 1.4968078136444092)]
([2, 6, 0], [2, 6, 0])
tensor(6.1548, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 2.0105843544006348)]
([2, 6, 0], [2, 6, 0])
tensor(6.1314, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 2.0237205028533936)]
([2, 6, 0], [2, 6, 0])
tensor(6.1313, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 2.033808708190918)]
([2, 6, 0], [2, 6, 0])
tensor(6.1313, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 

 58%|█████▊    | 424/725 [14:11:48<16:47:10, 200.76s/it]

test：1.0, test mean: 0.19144144144144143
([7, 2, 2], [7, 2, 2])
tensor(11.3618, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21019630134105682), (499999, 0.0), (499999, -0.0004088284040335566)]
([7, 2, 2], [7, 2, 2])
tensor(11.4099, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.092459678649902), (499999, 0.0), (499999, -0.0008176568080671132)]
([7, 2, 2], [7, 2, 2])
tensor(9.8410, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.0101776123046875), (499999, 0.0), (499999, -0.001226485357619822)]
([0, 2, 2], [7, 2, 2])
tensor(8.4589, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.766148328781128), (499999, 0.0), (499999, -0.0016353136161342263)]
([0, 2, 2], [7, 2, 2])
tensor(8.3532, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.7500176429748535), (499999, 0.0), (499999, -0.0020441424567252398)]
([0, 2, 2], [7, 2, 2])
tensor(8.3329, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.804826259613037), (499999, 0.0), (499999, -0

 59%|█████▊    | 425/725 [14:16:20<18:30:19, 222.06s/it]

test：0.0, test mean: 0.1901565995525727
([1, 2, 3], [1, 2, 3])
tensor(6.1795, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 59%|█████▉    | 426/725 [14:16:45<13:32:31, 163.05s/it]

([2, 2, 7], [2, 2, 7])
tensor(7.8174, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 59%|█████▉    | 427/725 [14:17:10<10:04:17, 121.67s/it]

([1, 5, 0], [1, 5, 0])
tensor(3.0137, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.010024303570389748)]
([1, 5, 0], [1, 5, 0])
tensor(3.1148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, -2.3175742626190186)]
([1, 5, 0], [1, 5, 0])
tensor(3.0030, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, -2.4337751865386963)]
([1, 5, 0], [1, 5, 0])
tensor(3.0531, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, -2.438664436340332)]
([1, 5, 0], [1, 5, 0])
tensor(2.9929, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, -2.4707255363464355)]
([1, 5, 0], [1, 5, 0])
tensor(2.9940, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, -2.512904167175293)]
([1, 5, 0], [1, 5, 0])
tensor(2.9924, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, -2.52841

 59%|█████▉    | 428/725 [14:21:59<14:10:31, 171.82s/it]

test：1.0, test mean: 0.19555555555555554
([5, 0, 6], [0, 0, 6])
tensor(2.8631, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 59%|█████▉    | 429/725 [14:22:25<10:31:29, 128.01s/it]

([1, 5, 0], [1, 5, 0])
tensor(2.6359, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 59%|█████▉    | 430/725 [14:22:51<7:59:07, 97.45s/it]  

([0, 7, 2], [0, 7, 2])
tensor(9.2074, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.2102212905883789), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(9.0009, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.797612190246582), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(7.3415, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.085083961486816), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(5.7750, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.38894784450531), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(5.3549, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.8673508167266846), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(5.3127, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.1621358394622803), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(5.2913, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.0904126167297363), (4999

 59%|█████▉    | 431/725 [14:27:25<12:16:40, 150.34s/it]

test：0.0, test mean: 0.19426048565121412
([0, 8, 1], [0, 5, 1])
tensor(4.0363, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.008899497799575329), (499999, -0.0015522923786193132), (499999, 0.0)]
([0, 8, 1], [0, 5, 1])
tensor(4.1046, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -2.171104669570923), (499999, -0.0031045847572386265), (499999, 0.0)]
([0, 8, 1], [0, 5, 1])
tensor(4.0364, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -2.1946728229522705), (499999, -0.004656876437366009), (499999, 0.0)]
([0, 8, 1], [0, 5, 1])
tensor(4.0933, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -3.8710737228393555), (499999, -0.006209169514477253), (499999, 0.0)]
([0, 8, 1], [0, 5, 1])
tensor(4.0322, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -3.7751340866088867), (499999, -0.007761460728943348), (499999, 0.0)]
([0, 8, 1], [0, 5, 1])
tensor(4.0300, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -3.8116238117218018), (499999, -0.00931375287473

 60%|█████▉    | 432/725 [14:31:49<15:01:09, 184.54s/it]

test：1.0, test mean: 0.1995614035087719
([7, 0, 0], [7, 0, 0])
tensor(1.3416, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 0.005892084911465645), (499999, 0.0)]
([7, 0, 0], [7, 0, 0])
tensor(1.4068, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 0.7535805702209473), (499999, 0.0)]
([7, 0, 0], [7, 0, 0])
tensor(1.3325, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 0.9874066114425659), (499999, 0.0)]
([7, 7, 0], [7, 0, 0])
tensor(1.8223, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 4.452584743499756), (499999, 0.0)]
([7, 0, 0], [7, 0, 0])
tensor(1.3242, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 4.456094741821289), (499999, 0.0)]
([7, 0, 0], [7, 0, 0])
tensor(1.3242, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 4.4596052169799805), (499999, 0.0)]
([7, 0, 0], [7, 0, 0])
tensor(1.3242, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0

 60%|█████▉    | 433/725 [14:36:18<17:00:55, 209.78s/it]

test：1.0, test mean: 0.2047930283224401
([2, 3, 1], [2, 3, 1])
tensor(8.3733, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 3.782251224038191e-05), (499999, 0.0), (499999, 0.0)]


 60%|█████▉    | 434/725 [14:36:45<12:31:47, 155.01s/it]

([5, 2, 1], [0, 2, 1])
tensor(6.9776, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 60%|██████    | 435/725 [14:37:10<9:20:53, 116.05s/it] 

([1, 5, 2], [1, 1, 2])
tensor(7.6611, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 60%|██████    | 436/725 [14:37:35<7:06:23, 88.52s/it] 

([8, 2, 2], [8, 2, 2])
tensor(10.4502, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 60%|██████    | 437/725 [14:38:00<5:34:04, 69.60s/it]

([0, 2, 0], [0, 2, 0])
tensor(5.3546, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -0.012294670566916466), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [0, 2, 0])
tensor(5.4016, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -1.1015403270721436), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [0, 2, 0])
tensor(5.3538, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -1.189828872680664), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [0, 2, 0])
tensor(5.3894, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -2.326700448989868), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [0, 2, 0])
tensor(5.3454, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -2.373839855194092), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [0, 2, 0])
tensor(5.3446, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -2.4121623039245605), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [0, 2, 0])
tensor(5.3438, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -2.4581103324890137), (499999, 0.0), (

 60%|██████    | 438/725 [14:42:25<10:14:00, 128.37s/it]

test：1.0, test mean: 0.20995670995670992
([7, 1, 2], [7, 1, 2])
tensor(10.3861, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21020369231700897), (499999, 0.0), (499999, 0.0)]
([7, 1, 2], [7, 1, 2])
tensor(10.2576, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.092506408691406), (499999, 0.0), (499999, 0.0)]
([7, 1, 2], [7, 1, 2])
tensor(8.6820, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.882821083068848), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(7.2212, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.489272117614746), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(7.1813, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.52478551864624), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(7.1759, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.503794193267822), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(7.1735, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 

 61%|██████    | 439/725 [14:46:46<13:21:17, 168.10s/it]

test：0.0, test mean: 0.20860215053763437
([2, 2, 2], [2, 2, 2])
tensor(14.3617, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 61%|██████    | 440/725 [14:47:11<9:54:32, 125.17s/it] 

([1, 5, 1], [1, 0, 1])
tensor(5.0558, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 61%|██████    | 441/725 [14:47:36<7:29:33, 94.98s/it] 

([5, 6, 0], [0, 6, 0])
tensor(2.2802, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 61%|██████    | 442/725 [14:48:02<5:50:33, 74.32s/it]

([7, 1, 5], [7, 1, 1])
tensor(4.0732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 61%|██████    | 443/725 [14:48:26<4:38:20, 59.22s/it]

([1, 2, 0], [1, 2, 0])
tensor(7.6337, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 61%|██████    | 444/725 [14:48:50<3:48:25, 48.77s/it]

([7, 1, 5], [7, 1, 8])
tensor(2.8905, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 61%|██████▏   | 445/725 [14:49:15<3:13:35, 41.49s/it]

([8, 8, 0], [8, 8, 0])
tensor(2.2862, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 62%|██████▏   | 446/725 [14:49:40<2:49:32, 36.46s/it]

([6, 7, 7], [6, 7, 0])
tensor(6.4534, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.3589215576648712), (499999, 0.0)]
([6, 7, 7], [6, 7, 0])
tensor(6.8819, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.680118203163147), (499999, 0.0)]
([6, 7, 7], [6, 7, 0])
tensor(5.1620, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.597724437713623), (499999, 0.0)]
([6, 0, 7], [6, 7, 0])
tensor(3.3211, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.53392219543457), (499999, 0.0)]
([6, 0, 7], [6, 7, 0])
tensor(3.2972, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.549745559692383), (499999, 0.0)]
([6, 0, 7], [6, 7, 0])
tensor(3.2916, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.522158622741699), (499999, 0.0)]
([6, 0, 7], [6, 7, 0])
tensor(3.2893, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.641390323638916), (499999, 0.0

 62%|██████▏   | 447/725 [14:54:12<8:17:17, 107.33s/it]

test：0.0, test mean: 0.20726495726495722
([2, 7, 1], [2, 7, 1])
tensor(9.9508, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 0.20689894258975983), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(9.8010, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -11.351507186889648), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(8.3781, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -6.145116329193115), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(7.7692, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -6.203683376312256), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(7.7679, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -6.247578144073486), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(7.7668, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -6.280647277832031), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(7.7664, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 

 62%|██████▏   | 448/725 [14:58:43<12:02:17, 156.45s/it]

test：0.0, test mean: 0.20594479830148618
([2, 7, 2], [2, 7, 2])
tensor(8.5196, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00012307032011449337), (499999, 0.0), (499999, 0.0)]


 62%|██████▏   | 449/725 [14:59:08<8:57:14, 116.79s/it] 

([8, 2, 0], [8, 2, 0])
tensor(5.6200, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0006719997036270797), (499999, 0.0), (499999, 0.0)]


 62%|██████▏   | 450/725 [14:59:33<6:49:04, 89.25s/it] 

([1, 0, 7], [1, 0, 7])
tensor(3.7847, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 62%|██████▏   | 451/725 [14:59:57<5:18:20, 69.71s/it]

([2, 3, 1], [2, 3, 1])
tensor(5.5930, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 62%|██████▏   | 452/725 [15:00:21<4:15:54, 56.24s/it]

([7, 2, 5], [7, 2, 5])
tensor(5.7934, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 62%|██████▏   | 453/725 [15:00:46<3:31:41, 46.69s/it]

([1, 1, 3], [1, 1, 3])
tensor(4.6770, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 63%|██████▎   | 454/725 [15:01:09<2:59:30, 39.74s/it]

([1, 1, 7], [1, 1, 7])
tensor(7.6149, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.35891270637512207)]
([1, 1, 7], [1, 1, 7])
tensor(8.0763, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -2.1391854286193848)]
([1, 1, 7], [1, 1, 7])
tensor(6.3811, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.568924903869629)]
([1, 1, 0], [1, 1, 7])
tensor(4.5522, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.3546605110168457)]
([1, 1, 0], [1, 1, 7])
tensor(4.5444, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.2845582962036133)]
([1, 1, 0], [1, 1, 7])
tensor(4.5414, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.418170928955078)]
([1, 1, 0], [1, 1, 7])
tensor(4.5390, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.3956136703

 63%|██████▎   | 455/725 [15:06:08<8:47:53, 117.31s/it]

test：0.0, test mean: 0.20464135021097044
([0, 1, 5], [0, 1, 5])
tensor(2.8258, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 63%|██████▎   | 456/725 [15:06:33<6:42:38, 89.81s/it] 

([8, 1, 2], [8, 1, 2])
tensor(8.6469, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 63%|██████▎   | 457/725 [15:07:00<5:16:46, 70.92s/it]

([3, 0, 2], [3, 0, 2])
tensor(4.1838, device='cuda:0', grad_fn=<NllLossBackward0>)
[(245759, -0.748542308807373), (499999, 0.0), (499999, 0.0)]
([3, 0, 2], [3, 0, 2])
tensor(4.1259, device='cuda:0', grad_fn=<NllLossBackward0>)
[(245759, -2.405029773712158), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(3.2777, device='cuda:0', grad_fn=<NllLossBackward0>)
[(245759, -1.3352782726287842), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(3.1572, device='cuda:0', grad_fn=<NllLossBackward0>)
[(245759, -1.746164083480835), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(3.1199, device='cuda:0', grad_fn=<NllLossBackward0>)
[(245759, -1.7591248750686646), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(3.1070, device='cuda:0', grad_fn=<NllLossBackward0>)
[(245759, -1.8138898611068726), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(3.1055, device='cuda:0', grad_fn=<NllLossBackward0>)
[(245759, -1.8392709493637085), (499999, 0.0), (49

 63%|██████▎   | 458/725 [15:11:42<9:57:45, 134.33s/it]

test：0.0, test mean: 0.2033542976939203
([5, 2, 1], [5, 2, 1])
tensor(7.3163, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.43826431035995483), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(6.7905, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 4.028852939605713), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(6.5198, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 4.192559242248535), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(6.5172, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 4.332658767700195), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(6.5159, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 4.446735382080078), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(6.5153, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 4.547221660614014), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(6.5146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 4.6344

 63%|██████▎   | 459/725 [15:16:25<13:12:57, 178.86s/it]

test：0.0, test mean: 0.2020833333333333
([2, 5, 1], [2, 5, 1])
tensor(6.9872, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 2.9883523893659003e-05), (499999, 0.0), (499999, 0.0)]


 63%|██████▎   | 460/725 [15:16:49<9:44:39, 132.38s/it] 

([5, 0, 7], [5, 0, 7])
tensor(5.6886, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.015254253521561623), (380927, 0.2102208137512207)]
([5, 0, 7], [5, 0, 7])
tensor(5.5043, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -1.2862026691436768), (380927, -9.51937198638916)]
([5, 0, 7], [5, 0, 7])
tensor(3.8309, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -1.2574102878570557), (380927, -4.029047012329102)]
([5, 0, 7], [5, 0, 7])
tensor(2.2314, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -1.24784255027771), (380927, -1.9671742916107178)]
([5, 0, 0], [5, 0, 7])
tensor(1.9040, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -1.2412810325622559), (380927, -2.039938449859619)]
([5, 0, 0], [5, 0, 7])
tensor(1.8423, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -1.2346415519714355), (380927, -1.4418978691101074)]
([5, 0, 0], [5, 0, 7])
tensor(1.8221, de

 64%|██████▎   | 461/725 [15:21:48<13:21:41, 182.20s/it]

test：0.5, test mean: 0.20393374741200826
([2, 1, 3], [2, 1, 3])
tensor(6.6982, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, -0.7050799131393433)]
([2, 1, 3], [2, 1, 3])
tensor(6.6389, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, -2.1685047149658203)]
([2, 1, 0], [2, 1, 3])
tensor(5.8455, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, -1.5669751167297363)]
([2, 1, 0], [2, 1, 3])
tensor(5.7336, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, -1.5175503492355347)]
([2, 1, 0], [2, 1, 3])
tensor(5.6651, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, -0.8357502222061157)]
([2, 1, 0], [2, 1, 3])
tensor(5.6468, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, -0.6175715923309326)]
([2, 1, 0], [2, 1, 3])
tensor(5.6417, device='cuda:0', grad_fn=<NllLossBackward0>)
[(4999

 64%|██████▎   | 462/725 [15:27:11<16:24:26, 224.59s/it]

test：0.0, test mean: 0.20267489711934153
([5, 7, 2], [5, 7, 2])
tensor(8.2243, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 3.8353802665369585e-05), (380927, 0.2102208435535431), (499999, 0.0)]
([5, 7, 2], [5, 7, 2])
tensor(7.9928, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 7.670760533073917e-05), (380927, -9.273950576782227), (499999, 0.0)]
([5, 7, 2], [5, 7, 2])
tensor(6.3125, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00011506141891004518), (380927, -3.233150005340576), (499999, 0.0)]
([5, 7, 2], [5, 7, 2])
tensor(4.7754, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00015341521066147834), (380927, -5.491600036621094), (499999, 0.0)]
([5, 0, 2], [5, 7, 2])
tensor(4.3974, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00019176900968886912), (380927, -5.156315803527832), (499999, 0.0)]
([5, 0, 2], [5, 7, 2])
tensor(4.3627, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00023012282326817513), (380927, -4.814986705780029

 64%|██████▍   | 463/725 [15:32:00<17:45:16, 243.96s/it]

test：0.0, test mean: 0.20143149284253575
([0, 1, 6], [1, 1, 6])
tensor(4.1165, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 64%|██████▍   | 464/725 [15:32:25<12:55:13, 178.21s/it]

([8, 7, 7], [8, 7, 7])
tensor(6.4880, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.2101881355047226), (380927, 0.2208685725927353)]
([8, 7, 7], [8, 7, 7])
tensor(6.3587, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.091806411743164), (380927, -4.464676856994629)]
([8, 7, 7], [8, 7, 7])
tensor(3.5599, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.038020133972168), (380927, 0.15409404039382935)]
([8, 0, 0], [8, 7, 7])
tensor(1.4540, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.9468393325805664), (380927, 0.135761559009552)]
([8, 0, 0], [8, 7, 7])
tensor(1.3890, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.228516578674316), (380927, 0.10246241092681885)]
([8, 0, 0], [8, 7, 7])
tensor(1.3723, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.3225860595703125), (380927, 0.07814565300941467)]
([8, 0, 0], [8, 7, 7])
tensor(1.3665, devi

 64%|██████▍   | 465/725 [15:37:12<15:13:37, 210.83s/it]

test：0.0, test mean: 0.2002032520325203
([1, 0, 2], [1, 0, 2])
tensor(6.9058, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 64%|██████▍   | 466/725 [15:37:37<11:09:53, 155.19s/it]

([0, 1, 0], [0, 1, 0])
tensor(2.4732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.007815682329237461)]


 64%|██████▍   | 467/725 [15:38:05<8:22:21, 116.83s/it] 

([5, 8, 5], [5, 8, 3])
tensor(3.5052, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 65%|██████▍   | 468/725 [15:38:31<6:24:29, 89.76s/it] 

([2, 8, 8], [2, 8, 8])
tensor(5.7870, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 0.5068960189819336), (499999, 0.0)]
([2, 8, 8], [2, 8, 8])
tensor(5.8743, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 4.787102222442627), (499999, 0.0)]
([2, 0, 8], [2, 8, 8])
tensor(5.0858, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 4.829245090484619), (499999, 0.0)]
([2, 0, 8], [2, 8, 8])
tensor(5.0845, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 4.803791046142578), (499999, 0.0)]
([2, 0, 8], [2, 8, 8])
tensor(5.0835, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 4.776244163513184), (499999, 0.0)]
([2, 0, 8], [2, 8, 8])
tensor(5.0826, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 4.752734661102295), (499999, 0.0)]
([2, 0, 8], [2, 8, 8])
tensor(5.0819, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 4.726177215576172), (499999, 0.0)

 65%|██████▍   | 469/725 [15:42:59<10:10:39, 143.12s/it]

test：0.0, test mean: 0.19898989898989897
([0, 1, 0], [1, 1, 0])
tensor(2.2608, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 65%|██████▍   | 470/725 [15:43:24<7:38:06, 107.79s/it] 

([1, 2, 0], [1, 2, 1])
tensor(5.7288, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 65%|██████▍   | 471/725 [15:43:49<5:51:21, 83.00s/it] 

([0, 2, 0], [0, 2, 1])
tensor(3.6830, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 2.9370858101174235e-06)]


 65%|██████▌   | 472/725 [15:44:14<4:35:33, 65.35s/it]

([2, 1, 1], [2, 1, 1])
tensor(6.8582, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 65%|██████▌   | 473/725 [15:44:39<3:44:37, 53.48s/it]

([1, 7, 2], [1, 7, 2])
tensor(9.6152, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.2212013602256775), (499999, 0.0)]
([1, 7, 2], [1, 7, 2])
tensor(9.7812, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -7.118375778198242), (499999, 0.0)]
([1, 7, 2], [1, 7, 2])
tensor(8.1487, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.2326769828796387), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(6.6786, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.688835382461548), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(6.6514, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.839052200317383), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(6.6449, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.969041109085083), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(6.6417, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.035601615905762), (49999

 65%|██████▌   | 474/725 [15:49:10<8:16:22, 118.66s/it]

test：0.0, test mean: 0.1977911646586345
([0, 2, 2], [0, 2, 2])
tensor(7.9522, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 66%|██████▌   | 475/725 [15:49:35<6:17:03, 90.49s/it] 

([0, 2, 2], [0, 2, 2])
tensor(6.7606, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 66%|██████▌   | 476/725 [15:50:00<4:53:46, 70.79s/it]

([1, 2, 1], [1, 2, 1])
tensor(8.2574, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 66%|██████▌   | 477/725 [15:50:26<3:57:16, 57.41s/it]

([6, 0, 2], [6, 0, 2])
tensor(5.5491, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.00012272335879970342), (499999, 0.0)]


 66%|██████▌   | 478/725 [15:50:53<3:18:51, 48.30s/it]

([0, 0, 7], [0, 0, 7])
tensor(4.0633, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.21022024750709534)]
([0, 0, 7], [0, 0, 7])
tensor(3.8648, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -9.797544479370117)]
([0, 0, 7], [0, 0, 7])
tensor(2.1958, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.08880615234375)]
([0, 0, 7], [0, 0, 7])
tensor(0.6353, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.10947585105895996)]
([0, 0, 0], [0, 0, 7])
tensor(0.3068, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.26948320865631104)]
([0, 0, 0], [0, 0, 7])
tensor(0.2531, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.5383729934692383)]
([0, 0, 0], [0, 0, 7])
tensor(0.2405, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.473612

 66%|██████▌   | 479/725 [15:55:35<8:05:25, 118.40s/it]

test：0.0, test mean: 0.19660678642714569
([1, 2, 2], [1, 2, 2])
tensor(11.0759, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.004046595189720392), (499999, 0.0), (499999, 0.0)]


 66%|██████▌   | 480/725 [15:56:02<6:11:40, 91.02s/it] 

([8, 0, 2], [8, 0, 2])
tensor(2.9959, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (344063, -0.0021508545614778996), (499999, 0.0)]
([8, 0, 2], [8, 0, 2])
tensor(3.1367, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (344063, -1.0531506538391113), (499999, 0.0)]
([8, 0, 2], [8, 0, 2])
tensor(2.9949, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (344063, -1.1328994035720825), (499999, 0.0)]
([8, 0, 2], [8, 0, 2])
tensor(3.1242, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (344063, -4.508420944213867), (499999, 0.0)]
([8, 0, 2], [8, 0, 2])
tensor(2.9868, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (344063, -4.533804416656494), (499999, 0.0)]
([8, 0, 2], [8, 0, 2])
tensor(3.0008, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (344063, -4.659915447235107), (499999, 0.0)]
([8, 0, 2], [8, 0, 2])
tensor(2.9851, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (344063, -4.671722888946533), (

 66%|██████▋   | 481/725 [16:00:34<9:50:33, 145.22s/it]

test：1.0, test mean: 0.20138888888888887
([1, 8, 0], [1, 8, 0])
tensor(4.6214, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (356351, -0.0012962459586560726)]
([1, 8, 0], [1, 8, 0])
tensor(4.6761, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (356351, -2.483551025390625)]
([1, 8, 0], [1, 8, 0])
tensor(4.6210, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (356351, -2.5017709732055664)]
([1, 8, 0], [1, 8, 0])
tensor(4.6723, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (356351, -4.850147247314453)]
([1, 8, 0], [1, 8, 0])
tensor(4.6177, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (356351, -4.850095272064209)]
([1, 8, 0], [1, 8, 0])
tensor(4.6230, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (356351, -5.1661057472229)]
([1, 8, 0], [1, 8, 0])
tensor(4.6173, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999,

 66%|██████▋   | 482/725 [16:05:25<12:45:17, 188.96s/it]

test：1.0, test mean: 0.2061143984220907
([0, 2, 2], [0, 2, 2])
tensor(8.0515, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 67%|██████▋   | 483/725 [16:05:52<9:26:22, 140.42s/it] 

([1, 8, 6], [1, 8, 6])
tensor(4.3981, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 67%|██████▋   | 484/725 [16:06:19<7:07:46, 106.50s/it]

([2, 1, 5], [2, 1, 5])
tensor(8.9235, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 67%|██████▋   | 485/725 [16:06:48<5:33:12, 83.30s/it] 

([1, 2, 0], [1, 2, 0])
tensor(5.2775, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, 0.08606286346912384)]
([1, 2, 0], [1, 2, 0])
tensor(5.4956, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, 0.7392873764038086)]
([1, 2, 0], [1, 2, 0])
tensor(5.2657, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, 1.0346534252166748)]
([1, 2, 7], [1, 2, 0])
tensor(5.9173, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -0.9296754598617554)]
([1, 2, 0], [1, 2, 0])
tensor(5.2414, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -0.9225986003875732)]
([1, 2, 0], [1, 2, 0])
tensor(5.2414, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -0.9154729843139648)]
([1, 2, 0], [1, 2, 0])
tensor(5.2414, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -0.908386

 67%|██████▋   | 486/725 [16:11:32<9:30:57, 143.34s/it]

test：1.0, test mean: 0.21078431372549017
([5, 0, 2], [5, 0, 2])
tensor(4.5370, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 67%|██████▋   | 487/725 [16:11:57<7:07:46, 107.84s/it]

([1, 1, 5], [1, 1, 8])
tensor(4.4409, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 67%|██████▋   | 488/725 [16:12:22<5:27:38, 82.95s/it] 

([7, 2, 2], [7, 2, 2])
tensor(14.3244, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21022041141986847), (499999, 2.643628431542311e-05), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(14.1317, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.273916244506836), (499999, 5.287256863084622e-05), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(12.4396, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.979618787765503), (499999, 7.930885476525873e-05), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(10.9101, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.059732437133789), (499999, 0.00010574513726169243), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(10.5355, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.651154041290283), (499999, 0.00013218143431004137), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(10.4801, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.396872043609619), (499999, 0.00015861770953051746), (499999, 0.0)]
([0, 2, 2], [7, 2,

 67%|██████▋   | 489/725 [16:17:02<9:19:13, 142.17s/it]

test：0.0, test mean: 0.20955165692007793
([0, 2, 2], [0, 2, 2])
tensor(8.1683, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 68%|██████▊   | 490/725 [16:17:27<6:59:36, 107.14s/it]

([0, 5, 5], [0, 5, 6])
tensor(2.4207, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.004242892377078533), (499999, 0.0)]


 68%|██████▊   | 491/725 [16:17:53<5:22:32, 82.70s/it] 

([1, 2, 3], [1, 2, 4])
tensor(8.8740, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 3.782420390052721e-05), (499999, 0.0)]


 68%|██████▊   | 492/725 [16:18:20<4:15:49, 65.88s/it]

([1, 1, 2], [1, 3, 2])
tensor(4.9777, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (339965, -0.4683351516723633), (499999, 0.0)]
([1, 1, 2], [1, 3, 2])
tensor(4.8444, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (339965, 0.9104543924331665), (499999, 0.0)]
([1, 1, 2], [1, 3, 2])
tensor(4.6322, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (339965, 1.266677975654602), (499999, 0.0)]
([1, 0, 2], [1, 3, 2])
tensor(4.4057, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (339965, 0.9239541292190552), (499999, 0.0)]
([1, 0, 2], [1, 3, 2])
tensor(4.3614, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (339965, 0.7886276245117188), (499999, 0.0)]
([1, 0, 2], [1, 3, 2])
tensor(4.3542, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (339965, 0.8156730532646179), (499999, 0.0)]
([1, 0, 2], [1, 3, 2])
tensor(4.3495, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (339965, 0.690613865852356), (499999,

 68%|██████▊   | 493/725 [16:23:02<8:26:04, 130.88s/it]

test：0.0, test mean: 0.20833333333333331
([2, 1, 0], [2, 1, 0])
tensor(5.3247, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (405503, 0.015771906822919846)]
([2, 1, 7], [2, 1, 0])
tensor(6.0095, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (405503, -10.92629623413086)]
([2, 1, 0], [2, 1, 0])
tensor(5.2969, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (405503, -10.980365753173828)]
([2, 1, 0], [2, 1, 0])
tensor(5.2962, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (405503, -11.030943870544434)]
([2, 1, 0], [2, 1, 0])
tensor(5.2957, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (405503, -11.080448150634766)]
([2, 1, 0], [2, 1, 0])
tensor(5.2952, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (405503, -11.12889289855957)]
([2, 1, 0], [2, 1, 0])
tensor(5.2949, device='cuda:0', grad_fn=<NllLossBackward0>)
[(49999

 68%|██████▊   | 494/725 [16:27:52<11:27:24, 178.55s/it]

test：1.0, test mean: 0.21290944123314062
([1, 2, 8], [1, 2, 8])
tensor(6.9661, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 68%|██████▊   | 495/725 [16:28:20<8:30:40, 133.22s/it] 

([2, 7, 2], [2, 7, 2])
tensor(10.9924, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.2102210521697998), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(10.8022, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.797592163085938), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(9.1276, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.088674545288086), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(7.5585, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.01543271541595459), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(7.1992, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.9123131036758423), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(7.1428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.5297614336013794), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(7.1237, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.6649339199066162), 

 68%|██████▊   | 496/725 [16:32:50<11:06:00, 174.50s/it]

test：0.0, test mean: 0.21168582375478925
([3, 8, 2], [3, 8, 2])
tensor(5.0115, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 0.40667909383773804), (499999, 0.0)]
([3, 1, 2], [3, 8, 2])
tensor(5.0433, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 7.443772315979004), (499999, 0.0)]
([3, 0, 2], [3, 8, 2])
tensor(4.2124, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 7.622020721435547), (499999, 0.0)]
([3, 0, 2], [3, 8, 2])
tensor(4.2008, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 7.728865623474121), (499999, 0.0)]
([3, 0, 2], [3, 8, 2])
tensor(4.1980, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 7.859615325927734), (499999, 0.0)]
([3, 0, 2], [3, 8, 2])
tensor(4.1965, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 7.964720249176025), (499999, 0.0)]
([3, 0, 2], [3, 8, 2])
tensor(4.1952, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), 

 69%|██████▊   | 497/725 [16:37:35<13:09:08, 207.67s/it]

test：0.0, test mean: 0.21047619047619046
([2, 8, 2], [2, 8, 2])
tensor(10.6048, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 69%|██████▊   | 498/725 [16:38:00<9:38:09, 152.82s/it] 

([7, 1, 2], [7, 1, 2])
tensor(4.8732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 69%|██████▉   | 499/725 [16:38:25<7:11:13, 114.48s/it]

([1, 2, 5], [1, 2, 1])
tensor(8.3954, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 69%|██████▉   | 500/725 [16:38:50<5:27:59, 87.46s/it] 

([5, 2, 5], [1, 2, 0])
tensor(4.9043, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.001239950768649578), (499999, 0.0), (499999, 0.0)]


 69%|██████▉   | 501/725 [16:39:16<4:17:43, 69.03s/it]

([1, 2, 2], [1, 2, 2])
tensor(12.3911, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 69%|██████▉   | 502/725 [16:39:40<3:26:40, 55.61s/it]

([8, 1, 1], [8, 1, 1])
tensor(7.1386, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.007454405073076487)]


 69%|██████▉   | 503/725 [16:40:05<2:52:01, 46.49s/it]

([7, 8, 5], [7, 8, 0])
tensor(4.0336, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.04087238758802414), (499999, 0.0)]


 70%|██████▉   | 504/725 [16:40:33<2:30:55, 40.97s/it]

([0, 6, 0], [0, 6, 0])
tensor(1.7009, device='cuda:0', grad_fn=<NllLossBackward0>)
[(356351, 0.019636694341897964), (499999, 0.0), (479231, -0.043980538845062256)]
([0, 6, 7], [0, 6, 0])
tensor(2.2494, device='cuda:0', grad_fn=<NllLossBackward0>)
[(356351, -0.6829824447631836), (499999, 0.0), (479231, -0.006554901599884033)]
([0, 6, 0], [0, 6, 0])
tensor(1.4486, device='cuda:0', grad_fn=<NllLossBackward0>)
[(356351, -0.32714301347732544), (499999, 0.0), (479231, 1.2218897342681885)]
([0, 6, 0], [0, 6, 0])
tensor(1.4961, device='cuda:0', grad_fn=<NllLossBackward0>)
[(356351, 0.4181976318359375), (499999, 0.0), (479231, 1.6830172538757324)]
([0, 6, 0], [0, 6, 0])
tensor(1.3890, device='cuda:0', grad_fn=<NllLossBackward0>)
[(356351, 0.4361439645290375), (499999, 0.0), (479231, 2.010836124420166)]
([0, 6, 0], [0, 6, 0])
tensor(1.3860, device='cuda:0', grad_fn=<NllLossBackward0>)
[(356351, 0.4511781930923462), (499999, 0.0), (479231, 2.271709442138672)]
([0, 6, 0], [0, 6, 0])
tensor(1.3848,

 70%|██████▉   | 505/725 [16:44:58<6:35:53, 107.97s/it]

test：1.0, test mean: 0.2149621212121212
([2, 1, 2], [2, 1, 2])
tensor(11.5576, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 70%|██████▉   | 506/725 [16:45:23<5:03:12, 83.07s/it] 

([8, 2, 3], [8, 2, 3])
tensor(6.6225, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.00017367035616189241), (454655, -0.8360863327980042)]
([8, 2, 3], [8, 2, 3])
tensor(6.6233, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.00034734071232378483), (454655, -2.7396340370178223)]
([8, 2, 8], [8, 2, 3])
tensor(5.7868, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0005210111849009991), (454655, -3.4089245796203613)]
([8, 2, 0], [8, 2, 3])
tensor(5.3977, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0006946814246475697), (454655, -2.0773110389709473)]
([8, 2, 0], [8, 2, 3])
tensor(5.3351, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0008683553896844387), (454655, -1.4700028896331787)]
([8, 2, 0], [8, 2, 3])
tensor(5.3236, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0010420242324471474), (454655, -1.176987648010254)]
([8, 2, 0], [8,

 70%|██████▉   | 507/725 [16:49:58<8:31:02, 140.65s/it]

test：0.0, test mean: 0.21374764595103576
([1, 0, 1], [1, 0, 1])
tensor(2.8678, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.003313793335109949), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(2.8952, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.6631028652191162), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(2.8678, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.683922290802002), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(2.8861, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.1004945039749146), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(2.8659, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.1046990156173706), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(2.8709, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.4096622467041016), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(2.8656, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499

 70%|███████   | 508/725 [16:54:24<10:45:37, 178.51s/it]

test：1.0, test mean: 0.21816479400749061
([2, 7, 2], [2, 7, 2])
tensor(10.6738, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.3479316830635071), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(11.0313, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.6599440574645996), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(9.1205, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.177480697631836), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(7.2175, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.576391220092773), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(7.0502, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.6117026805877686), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(6.9949, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.139673233032227), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(6.9867, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999

 70%|███████   | 509/725 [16:59:01<12:28:09, 207.82s/it]

test：0.0, test mean: 0.21694599627560518
([2, 5, 8], [2, 5, 8])
tensor(7.3972, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.001359602203592658), (499999, 0.0), (499999, 0.0)]


 70%|███████   | 510/725 [16:59:28<9:10:22, 153.59s/it] 

([2, 8, 8], [2, 8, 0])
tensor(5.5162, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0036624937783926725), (196607, 0.3685833811759949), (499999, 0.0)]
([2, 7, 8], [2, 8, 0])
tensor(5.5850, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.007324987556785345), (196607, 4.06245756149292), (499999, 0.0)]
([2, 0, 8], [2, 8, 0])
tensor(4.7707, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.010987482033669949), (196607, 4.175438404083252), (499999, 0.0)]
([2, 0, 8], [2, 8, 0])
tensor(4.7652, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.01464997511357069), (196607, 4.247312545776367), (499999, 0.0)]
([2, 0, 8], [2, 8, 0])
tensor(4.7640, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.018312469124794006), (196607, 4.304777145385742), (499999, 0.0)]
([2, 0, 8], [2, 8, 0])
tensor(4.7626, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.021974962204694748), (196607, 4.33359956741333), (499999, 0.0)]
([2, 0, 8], [2, 8, 0])
tensor(4.7622, dev

 70%|███████   | 511/725 [17:04:16<11:31:59, 194.02s/it]

test：0.0, test mean: 0.2157407407407407
([2, 7, 3], [2, 7, 3])
tensor(7.6150, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21018937230110168), (208895, -0.6021767854690552)]
([2, 7, 3], [2, 7, 3])
tensor(7.5809, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -8.257953643798828), (208895, -1.9329795837402344)]
([2, 7, 0], [2, 7, 3])
tensor(5.3857, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.1155195236206055), (208895, -1.9877371788024902)]
([2, 0, 0], [2, 7, 3])
tensor(4.1560, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.575228214263916), (208895, -0.7362044453620911)]
([2, 0, 0], [2, 7, 3])
tensor(3.9010, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.7909650802612305), (208895, -0.28303277492523193)]
([2, 0, 0], [2, 7, 3])
tensor(3.8764, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.939471483230591), (208895, -0.147441804409027

 71%|███████   | 512/725 [17:09:41<13:47:50, 233.19s/it]

test：0.0, test mean: 0.21454880294659298
([7, 6, 3], [7, 6, 3])
tensor(6.1575, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.2101878970861435), (499999, 0.0), (499999, 0.0)]
([7, 6, 3], [7, 6, 3])
tensor(6.1580, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.091907501220703), (499999, 0.0), (499999, 0.0)]
([7, 6, 3], [7, 6, 3])
tensor(4.5796, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.906617164611816), (499999, 0.0), (499999, 0.0)]
([0, 6, 3], [7, 6, 3])
tensor(3.2506, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.805576324462891), (499999, 0.0), (499999, 0.0)]
([0, 6, 3], [7, 6, 3])
tensor(3.2263, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.990981101989746), (499999, 0.0), (499999, 0.0)]
([0, 6, 3], [7, 6, 3])
tensor(3.2202, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.116344451904297), (499999, 0.0), (499999, 0.0)]
([0, 6, 3], [7, 6, 3])
tensor(3.2166, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5

 71%|███████   | 513/725 [17:14:03<14:14:58, 241.98s/it]

test：0.0, test mean: 0.21336996336996333
([7, 8, 0], [7, 8, 0])
tensor(4.1517, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21019233763217926), (499999, 0.0), (499999, 0.0)]
([7, 8, 0], [7, 8, 0])
tensor(4.1379, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.092008590698242), (499999, 0.0), (499999, 0.0)]
([7, 8, 0], [7, 8, 0])
tensor(2.5717, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.902656078338623), (499999, 0.0), (499999, 0.0)]
([0, 8, 0], [7, 8, 0])
tensor(1.2144, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.727441787719727), (499999, 0.0), (499999, 0.0)]
([0, 8, 0], [7, 8, 0])
tensor(1.1896, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.951650619506836), (499999, 0.0), (499999, 0.0)]
([0, 8, 0], [7, 8, 0])
tensor(1.1825, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.0763139724731445), (499999, 0.0), (499999, 0.0)]
([0, 8, 0], [7, 8, 0])
tensor(1.1806, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 

 71%|███████   | 514/725 [17:18:29<14:36:17, 249.18s/it]

test：0.0, test mean: 0.21220400728597447
([0, 2, 2], [0, 2, 2])
tensor(7.9088, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.0047390758991241455), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(7.9189, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.5366547107696533), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(7.9053, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.6787313222885132), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(7.9079, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.9315896034240723), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(7.8945, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.990854024887085), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(7.8938, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.042830228805542), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(7.8931, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380

 71%|███████   | 515/725 [17:22:54<14:48:42, 253.91s/it]

test：1.0, test mean: 0.21648550724637677
([0, 2, 0], [0, 2, 0])
tensor(4.2756, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.001854530069977045), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [0, 2, 0])
tensor(4.3357, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 1.3818552494049072), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [0, 2, 0])
tensor(4.2756, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 1.3533494472503662), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [0, 2, 0])
tensor(4.3246, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 2.776010513305664), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [0, 2, 0])
tensor(4.2707, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 2.8264312744140625), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [0, 2, 0])
tensor(4.2704, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 2.8225531578063965), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [0, 2, 0])
tensor(4.2699, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 

 71%|███████   | 516/725 [17:27:26<15:02:56, 259.22s/it]

test：1.0, test mean: 0.2207207207207207
([1, 0, 5], [1, 0, 0])
tensor(1.9471, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 2.0433588360901922e-05)]


 71%|███████▏  | 517/725 [17:27:50<10:54:09, 188.70s/it]

([2, 5, 2], [2, 5, 2])
tensor(7.6818, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 71%|███████▏  | 518/725 [17:28:13<7:59:36, 139.02s/it] 

([2, 8, 1], [2, 8, 1])
tensor(9.9378, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 72%|███████▏  | 519/725 [17:28:37<5:58:30, 104.42s/it]

([5, 2, 7], [7, 2, 7])
tensor(9.2481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 0.20988187193870544)]
([5, 2, 7], [7, 2, 7])
tensor(9.1374, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -8.152772903442383)]
([5, 2, 7], [7, 2, 7])
tensor(7.5917, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -5.185442924499512)]
([5, 2, 0], [7, 2, 7])
tensor(6.8408, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -5.2191386222839355)]
([5, 2, 0], [7, 2, 7])
tensor(6.8396, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -5.313924312591553)]
([5, 2, 0], [7, 2, 7])
tensor(6.8388, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -5.338151931762695)]
([5, 2, 0], [7, 2, 7])
tensor(6.8378, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -5.40248632

 72%|███████▏  | 520/725 [17:33:35<9:15:57, 162.72s/it]

test：0.0, test mean: 0.21953405017921143
([7, 2, 2], [7, 2, 2])
tensor(11.2646, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 72%|███████▏  | 521/725 [17:34:00<6:52:30, 121.33s/it]

([0, 8, 0], [0, 8, 0])
tensor(1.9420, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0010813865810632706), (499999, 0.0), (499999, 0.0)]


 72%|███████▏  | 522/725 [17:34:23<5:10:23, 91.74s/it] 

([7, 0, 0], [7, 0, 0])
tensor(2.0405, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.34697335958480835), (499999, 0.0), (499999, 0.00028169213328510523)]
([7, 0, 0], [7, 0, 0])
tensor(2.2662, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.329655647277832), (499999, 0.0), (499999, 0.0005633842665702105)]
([7, 0, 0], [7, 0, 0])
tensor(1.1733, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 7.522727012634277), (499999, 0.0), (499999, 0.0008450764580629766)]
([0, 0, 0], [7, 0, 0])
tensor(0.1599, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 7.495619773864746), (499999, 0.0), (499999, 0.001126768533140421)]
([0, 0, 0], [7, 0, 0])
tensor(0.1597, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 7.464514255523682), (499999, 0.0), (499999, 0.001408460782840848)]
([0, 0, 0], [7, 0, 0])
tensor(0.1594, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 7.457537651062012), (499999, 0.0), (499999, 0.0016901527997106314)]
([0, 0, 0], [7, 0, 0])
tensor(0.

 72%|███████▏  | 523/725 [17:38:51<8:07:16, 144.74s/it]

test：0.0, test mean: 0.21836007130124774
([1, 8, 0], [1, 8, 1])
tensor(3.9980, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, -0.2415592074394226)]
([1, 8, 1], [1, 8, 1])
tensor(4.3838, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, 4.451891899108887)]
([1, 8, 0], [1, 8, 1])
tensor(3.9950, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, 3.407151222229004)]
([1, 8, 1], [1, 8, 1])
tensor(4.2285, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, 4.759450912475586)]
([1, 8, 0], [1, 8, 1])
tensor(3.6792, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, 4.680842399597168)]
([1, 8, 0], [1, 8, 1])
tensor(3.6640, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, 4.6490583419799805)]
([1, 8, 0], [1, 8, 1])
tensor(3.6630, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0),

 72%|███████▏  | 524/725 [17:44:08<10:57:50, 196.37s/it]

test：0.0, test mean: 0.21719858156028365
([7, 2, 8], [7, 2, 8])
tensor(7.0577, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 0.20843657851219177), (499999, 0.0), (499999, 0.0)]
([7, 2, 8], [7, 2, 8])
tensor(6.8763, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -8.638137817382812), (499999, 0.0), (499999, 0.0)]
([7, 2, 8], [7, 2, 8])
tensor(5.3022, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -8.317296981811523), (499999, 0.0), (499999, 0.0)]
([0, 2, 8], [7, 2, 8])
tensor(4.8755, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -8.12848949432373), (499999, 0.0), (499999, 0.0)]
([0, 2, 8], [7, 2, 8])
tensor(4.8738, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -8.061635971069336), (499999, 0.0), (499999, 0.0)]
([0, 2, 8], [7, 2, 8])
tensor(4.8735, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -8.020709037780762), (499999, 0.0), (499999, 0.0)]
([0, 2, 8], [7, 2, 8])
tensor(4.8732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -7

 72%|███████▏  | 525/725 [17:48:29<11:59:22, 215.81s/it]

test：0.0, test mean: 0.21604938271604937
([0, 7, 1], [0, 7, 1])
tensor(6.2734, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21022114157676697), (499999, 0.0)]
([0, 7, 1], [0, 7, 1])
tensor(6.1497, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.519384384155273), (499999, 0.0)]
([0, 7, 1], [0, 7, 1])
tensor(4.5497, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.025752544403076), (499999, 0.0)]
([0, 7, 1], [0, 7, 1])
tensor(2.9134, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.8553718328475952), (499999, 0.0)]
([0, 0, 1], [0, 7, 1])
tensor(2.5457, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.2124266624450684), (499999, 0.0)]
([0, 0, 1], [0, 7, 1])
tensor(2.4906, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.7495250701904297), (499999, 0.0)]
([0, 0, 1], [0, 7, 1])
tensor(2.4648, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999

 73%|███████▎  | 526/725 [17:53:13<13:03:01, 236.09s/it]

test：0.0, test mean: 0.21491228070175436
([0, 1, 1], [0, 1, 1])
tensor(4.2775, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.02147303894162178), (499999, 0.0), (499999, 0.0)]


 73%|███████▎  | 527/725 [17:53:37<9:29:41, 172.63s/it] 

([2, 6, 2], [2, 6, 2])
tensor(10.9854, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 73%|███████▎  | 528/725 [17:54:03<7:01:52, 128.49s/it]

([2, 3, 7], [2, 3, 7])
tensor(8.0776, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -0.6287513971328735), (380927, 0.2102203667163849)]
([2, 3, 7], [2, 3, 7])
tensor(8.0570, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -1.9606585502624512), (380927, -9.797582626342773)]
([2, 3, 7], [2, 3, 7])
tensor(6.3713, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -3.2186923027038574), (380927, -4.08682918548584)]
([2, 3, 7], [2, 3, 7])
tensor(4.6855, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -3.858346939086914), (380927, -1.2632720470428467)]
([2, 0, 0], [2, 3, 7])
tensor(3.7068, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -3.7457756996154785), (380927, -1.755258321762085)]
([2, 0, 0], [2, 3, 7])
tensor(3.5465, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -2.6263792514801025), (380927, -1.8662594556808472)]
([2, 0, 0], [2, 3, 7])
tensor(3.5001, de

 73%|███████▎  | 529/725 [17:59:06<9:51:14, 180.99s/it]

test：0.0, test mean: 0.2137870855148342
([0, 5, 0], [0, 0, 0])
tensor(0.9730, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.00688801147043705)]
([0, 5, 0], [0, 0, 0])
tensor(1.1810, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.77846622467041)]
([0, 5, 0], [0, 0, 0])
tensor(0.9711, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -5.023991107940674)]
([0, 5, 0], [0, 0, 0])
tensor(1.1425, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -9.312887191772461)]
([0, 5, 0], [0, 0, 0])
tensor(0.9477, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -9.35993480682373)]
([0, 5, 0], [0, 0, 0])
tensor(0.9748, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -11.044719696044922)]
([0, 5, 0], [0, 0, 0])
tensor(0.9454, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.

 73%|███████▎  | 530/725 [18:03:51<11:29:05, 212.03s/it]

test：1.0, test mean: 0.21788194444444442
([0, 0, 7], [1, 0, 7])
tensor(3.2137, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.35630252957344055), (491519, 0.006106872111558914), (380927, 0.22119905054569244)]
([1, 0, 7], [1, 0, 7])
tensor(4.6954, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 2.3156323432922363), (491519, 0.7396910190582275), (380927, -8.553781509399414)]
([0, 0, 7], [1, 0, 7])
tensor(1.6273, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 1.2334043979644775), (491519, 0.8572513461112976), (380927, -5.560672283172607)]
([0, 0, 0], [1, 0, 7])
tensor(0.2950, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.6364792585372925), (491519, 2.373809337615967), (380927, -5.94981575012207)]
([0, 0, 0], [1, 0, 7])
tensor(0.1145, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -1.082261562347412), (491519, 2.3879542350769043), (380927, -6.394306182861328)]
([0, 0, 0], [1, 0, 7])
tensor(0.0775, device='cuda:0', grad_fn=<NllLossBackward0>)
[(

 73%|███████▎  | 531/725 [18:08:37<12:38:00, 234.44s/it]

test：0.3333333333333333, test mean: 0.21848013816925732
([2, 2, 8], [2, 2, 8])
tensor(8.9640, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 73%|███████▎  | 532/725 [18:09:02<9:12:03, 171.63s/it] 

([5, 2, 5], [0, 2, 8])
tensor(6.7564, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0036930739879608154), (499999, 0.0), (499999, 0.0)]


 74%|███████▎  | 533/725 [18:09:27<6:48:22, 127.62s/it]

([2, 3, 3], [2, 3, 3])
tensor(7.7051, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 74%|███████▎  | 534/725 [18:09:53<5:08:25, 96.89s/it] 

([2, 5, 5], [2, 5, 5])
tensor(5.0275, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 74%|███████▍  | 535/725 [18:10:20<4:00:35, 75.97s/it]

([7, 8, 5], [7, 8, 3])
tensor(5.0863, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 0.22049565613269806), (499999, 0.0), (499999, 0.0)]
([7, 8, 5], [7, 8, 3])
tensor(5.1652, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -6.49070930480957), (499999, 0.0), (499999, 0.0)]
([7, 8, 5], [7, 8, 3])
tensor(3.4397, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -3.955259084701538), (499999, 0.0), (499999, 0.0)]
([0, 8, 5], [7, 8, 3])
tensor(2.8645, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -4.00272274017334), (499999, 0.0), (499999, 0.0)]
([0, 8, 5], [7, 8, 3])
tensor(2.8627, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -4.0976715087890625), (499999, 0.0), (499999, 0.0)]
([0, 8, 5], [7, 8, 3])
tensor(2.8614, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -4.215960502624512), (499999, 0.0), (499999, 0.0)]
([0, 8, 5], [7, 8, 3])
tensor(2.8607, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -4.220043659210205), (499999, 0.0), (499999

 74%|███████▍  | 536/725 [18:14:45<6:58:18, 132.80s/it]

test：0.0, test mean: 0.21735395189003434
([0, 2, 7], [0, 2, 7])
tensor(5.9646, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -0.009422794915735722), (499999, 0.0), (380927, -0.3478888273239136)]
([0, 2, 7], [0, 2, 7])
tensor(6.4235, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -0.06996240466833115), (499999, 0.0), (380927, -1.0594639778137207)]
([0, 2, 7], [0, 2, 7])
tensor(4.8315, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -0.06391146779060364), (499999, 0.0), (380927, 3.305525779724121)]
([0, 2, 0], [0, 2, 7])
tensor(3.0372, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -0.0737118124961853), (499999, 0.0), (380927, 2.472813129425049)]
([0, 2, 0], [0, 2, 7])
tensor(3.0230, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -0.06877648830413818), (499999, 0.0), (380927, 2.2537264823913574)]
([0, 2, 0], [0, 2, 7])
tensor(3.0202, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -0.074170783162117), (499999, 0.0), (380927, 2.04559707641601

 74%|███████▍  | 537/725 [18:19:31<9:20:02, 178.73s/it]

test：0.5, test mean: 0.2188034188034188
([5, 5, 5], [0, 1, 5])
tensor(3.1874, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 74%|███████▍  | 538/725 [18:19:58<6:54:52, 133.12s/it]

([1, 1, 0], [1, 0, 0])
tensor(3.0791, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 74%|███████▍  | 539/725 [18:20:23<5:12:04, 100.67s/it]

([0, 7, 0], [0, 7, 0])
tensor(2.5802, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 0.2098212093114853), (499999, 0.0)]
([0, 7, 0], [0, 7, 0])
tensor(2.4630, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -9.356391906738281), (499999, 0.0)]
([0, 7, 0], [0, 7, 0])
tensor(0.8351, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -8.657735824584961), (499999, 0.0)]
([0, 0, 0], [0, 7, 0])
tensor(0.2063, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -8.580777168273926), (499999, 0.0)]
([0, 0, 0], [0, 7, 0])
tensor(0.2054, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -8.540778160095215), (499999, 0.0)]
([0, 0, 0], [0, 7, 0])
tensor(0.2050, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -8.515974998474121), (499999, 0.0)]
([0, 0, 0], [0, 7, 0])
tensor(0.2047, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -8.489436149597168), (499999

 74%|███████▍  | 540/725 [18:25:11<8:04:01, 156.98s/it]

test：0.0, test mean: 0.21768707482993196
([5, 0, 0], [5, 0, 1])
tensor(1.2032, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 75%|███████▍  | 541/725 [18:25:37<6:01:14, 117.80s/it]

([0, 2, 7], [0, 2, 7])
tensor(6.7800, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0003862857411149889), (499999, 0.0), (499999, 0.0)]


 75%|███████▍  | 542/725 [18:26:05<4:36:26, 90.63s/it] 

([2, 1, 3], [2, 1, 3])
tensor(7.0338, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.016367027536034584)]


 75%|███████▍  | 543/725 [18:26:31<3:36:40, 71.43s/it]

([8, 6, 7], [8, 6, 7])
tensor(3.1186, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 75%|███████▌  | 544/725 [18:26:58<2:55:07, 58.06s/it]

([1, 2, 0], [1, 2, 0])
tensor(4.3028, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 75%|███████▌  | 545/725 [18:27:25<2:25:49, 48.61s/it]

([8, 0, 3], [7, 0, 3])
tensor(1.4182, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.008196447975933552), (499999, 0.0)]
([8, 0, 3], [7, 0, 3])
tensor(1.4181, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.04842090606689453), (499999, 0.0)]
([8, 0, 3], [7, 0, 3])
tensor(1.4155, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.04879850149154663), (499999, 0.0)]
([8, 0, 3], [7, 0, 3])
tensor(1.4149, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.05331575870513916), (499999, 0.0)]
([8, 0, 3], [7, 0, 3])
tensor(1.4147, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.0604027584195137), (499999, 0.0)]
([8, 0, 3], [7, 0, 3])
tensor(1.4146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.0629718154668808), (499999, 0.0)]
([8, 0, 3], [7, 0, 3])
tensor(1.4145, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.06898199021816

 75%|███████▌  | 546/725 [18:32:10<5:56:38, 119.54s/it]

test：1.0, test mean: 0.22165820642978
([2, 0, 8], [2, 0, 8])
tensor(5.1191, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 0.002413438633084297), (499999, 0.0)]
([2, 1, 8], [2, 0, 8])
tensor(5.5052, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 0.661821722984314), (499999, 0.0)]
([2, 0, 8], [2, 0, 8])
tensor(5.1174, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 0.5995994210243225), (499999, 0.0)]
([2, 1, 8], [2, 0, 8])
tensor(5.4405, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 3.328319549560547), (499999, 0.0)]
([2, 0, 8], [2, 0, 8])
tensor(5.1143, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 3.3045735359191895), (499999, 0.0)]
([2, 0, 8], [2, 0, 8])
tensor(5.2442, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 3.237579822540283), (499999, 0.0)]
([2, 0, 8], [2, 0, 8])
tensor(5.1133, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), 

 75%|███████▌  | 547/725 [18:36:46<8:14:26, 166.67s/it]

test：1.0, test mean: 0.2255892255892256
([7, 8, 0], [7, 8, 1])
tensor(2.2046, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 76%|███████▌  | 548/725 [18:37:11<6:06:05, 124.10s/it]

([7, 0, 8], [7, 8, 8])
tensor(2.8780, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 76%|███████▌  | 549/725 [18:37:35<4:35:32, 93.94s/it] 

([2, 3, 0], [2, 7, 0])
tensor(4.0244, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 76%|███████▌  | 550/725 [18:37:58<3:32:28, 72.85s/it]

([2, 1, 0], [2, 1, 7])
tensor(5.4089, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 76%|███████▌  | 551/725 [18:38:24<2:50:13, 58.70s/it]

([7, 1, 2], [7, 1, 2])
tensor(6.9657, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.3587718605995178), (499999, 0.0), (499999, 0.0)]
([7, 1, 2], [7, 1, 2])
tensor(7.4948, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -2.355659008026123), (499999, 0.0), (499999, 0.0)]
([7, 1, 2], [7, 1, 2])
tensor(6.0831, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 4.460549831390381), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(4.7998, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 4.113632678985596), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(4.7977, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 3.9401142597198486), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(4.7969, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 3.7714993953704834), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(4.7966, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 3.6111562252044678), (499999, 0.0), (499999,

 76%|███████▌  | 552/725 [18:42:53<5:50:55, 121.71s/it]

test：0.0, test mean: 0.22445561139028475
([8, 0, 2], [8, 0, 2])
tensor(4.8247, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.11236236989498138), (499999, 0.0), (499999, 0.0)]
([1, 0, 2], [8, 0, 2])
tensor(5.1731, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 4.290914535522461), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [8, 0, 2])
tensor(4.3424, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 4.375716209411621), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [8, 0, 2])
tensor(4.3377, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 4.423269271850586), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [8, 0, 2])
tensor(4.3359, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 4.398327827453613), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [8, 0, 2])
tensor(4.3343, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 4.36875581741333), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [8, 0, 2])
tensor(4.3336, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 4.34669

 76%|███████▋  | 553/725 [18:47:27<8:00:12, 167.51s/it]

test：0.0, test mean: 0.22333333333333333
([5, 2, 2], [8, 2, 2])
tensor(9.4280, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 76%|███████▋  | 554/725 [18:47:52<5:55:00, 124.56s/it]

([6, 2, 2], [6, 2, 2])
tensor(10.8767, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 77%|███████▋  | 555/725 [18:48:16<4:27:30, 94.41s/it] 

([5, 1, 8], [1, 1, 8])
tensor(4.7254, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.008355113677680492), (499999, 0.0), (499999, 0.0)]


 77%|███████▋  | 556/725 [18:48:39<3:26:19, 73.25s/it]

([8, 2, 1], [8, 2, 1])
tensor(7.3155, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 77%|███████▋  | 557/725 [18:49:05<2:44:38, 58.80s/it]

([8, 5, 5], [8, 2, 8])
tensor(3.8244, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.005959759466350079), (499999, 0.0)]


 77%|███████▋  | 558/725 [18:49:30<2:15:51, 48.81s/it]

([5, 1, 6], [5, 1, 6])
tensor(4.9197, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 77%|███████▋  | 559/725 [18:49:56<1:55:49, 41.87s/it]

([1, 1, 2], [1, 1, 2])
tensor(6.4279, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 77%|███████▋  | 560/725 [18:50:22<1:42:31, 37.28s/it]

([8, 1, 2], [8, 1, 2])
tensor(9.4388, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 77%|███████▋  | 561/725 [18:50:51<1:34:32, 34.59s/it]

([5, 6, 8], [5, 6, 8])
tensor(5.0266, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 78%|███████▊  | 562/725 [18:51:18<1:27:47, 32.31s/it]

([2, 1, 0], [2, 1, 1])
tensor(4.1986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 78%|███████▊  | 563/725 [18:51:43<1:21:56, 30.35s/it]

([2, 6, 3], [2, 6, 3])
tensor(5.8803, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 78%|███████▊  | 564/725 [18:52:10<1:18:10, 29.13s/it]

([3, 2, 8], [3, 2, 8])
tensor(6.0513, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 78%|███████▊  | 565/725 [18:52:38<1:16:49, 28.81s/it]

([7, 8, 7], [7, 8, 7])
tensor(7.8971, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.35894137620925903), (499999, 0.0), (380927, 0.21020342409610748)]
([7, 8, 7], [7, 8, 7])
tensor(8.1484, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.076993942260742), (499999, 0.0), (380927, -9.092448234558105)]
([7, 8, 7], [7, 8, 7])
tensor(4.7546, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 4.5624799728393555), (499999, 0.0), (380927, -4.257225513458252)]
([7, 8, 0], [7, 8, 7])
tensor(1.3163, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.4294092059135437), (499999, 0.0), (380927, -4.401902198791504)]
([0, 8, 0], [7, 8, 7])
tensor(1.0496, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.2139899730682373), (499999, 0.0), (380927, -4.192382335662842)]
([0, 8, 0], [7, 8, 7])
tensor(0.9953, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.3010559678077698), (499999, 0.0), (380927, -4.177061557769775)]
([0, 8, 0], [7, 8, 7])
tensor(0.9775, devic

 78%|███████▊  | 566/725 [18:57:44<4:56:45, 111.98s/it]

test：0.0, test mean: 0.2222222222222222
([0, 2, 2], [0, 2, 2])
tensor(10.0606, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 78%|███████▊  | 567/725 [18:58:11<3:47:47, 86.51s/it] 

([1, 2, 1], [1, 2, 1])
tensor(5.7179, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 78%|███████▊  | 568/725 [18:58:36<2:58:17, 68.14s/it]

([1, 7, 2], [1, 7, 2])
tensor(7.1859, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 78%|███████▊  | 569/725 [18:59:03<2:25:10, 55.84s/it]

([5, 0, 2], [5, 0, 2])
tensor(3.8485, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00013104754907544702), (454655, -0.00521494448184967), (499999, 0.0)]
([5, 0, 2], [5, 0, 2])
tensor(3.8907, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00026209509815089405), (454655, -1.0465807914733887), (499999, 0.0)]
([5, 0, 2], [5, 0, 2])
tensor(3.8486, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0003931429819203913), (454655, -1.0627273321151733), (499999, 0.0)]
([5, 0, 2], [5, 0, 2])
tensor(3.8826, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0005241901963017881), (454655, -1.567622423171997), (499999, 0.0)]
([5, 0, 2], [5, 0, 2])
tensor(3.8463, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0006552378181368113), (454655, -1.5150632858276367), (499999, 0.0)]
([5, 0, 2], [5, 0, 2])
tensor(3.8448, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.000786285731010139), (454655, -1.5348756313323975), (499999, 0.0)]
([5, 0, 2], [5, 0, 2]

 79%|███████▊  | 570/725 [19:03:45<5:19:14, 123.58s/it]

test：1.0, test mean: 0.22607260726072606
([2, 8, 1], [2, 8, 1])
tensor(7.1465, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 79%|███████▉  | 571/725 [19:04:10<4:01:04, 93.92s/it] 

([1, 5, 2], [1, 5, 2])
tensor(6.0810, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 79%|███████▉  | 572/725 [19:04:36<3:07:56, 73.70s/it]

([7, 0, 2], [7, 0, 2])
tensor(6.1396, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.2211945354938507), (380927, 0.021739032119512558), (499999, 0.0)]
([7, 0, 2], [7, 0, 2])
tensor(6.3919, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.118013858795166), (380927, -2.0829687118530273), (499999, 0.0)]
([7, 0, 2], [7, 0, 2])
tensor(4.6124, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.2393085956573486), (380927, -1.7634683847427368), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(3.2190, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.1641757488250732), (380927, -1.6500070095062256), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(3.1954, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.2943215370178223), (380927, -1.5697832107543945), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(3.1872, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.4222373962402344), (380927, -1.5086404085159302), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(3.182

 79%|███████▉  | 573/725 [19:09:03<5:33:40, 131.72s/it]

test：0.5, test mean: 0.22742200328407225
([5, 2, 7], [8, 2, 7])
tensor(7.1911, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.009492152370512486), (499999, 0.0), (380927, 0.21020299196243286)]
([5, 2, 7], [8, 2, 7])
tensor(7.0924, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.01898430474102497), (499999, 0.0), (380927, -9.092418670654297)]
([5, 2, 7], [8, 2, 7])
tensor(5.5214, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.02847645804286003), (499999, 0.0), (380927, -4.881452560424805)]
([5, 2, 0], [8, 2, 7])
tensor(4.0515, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.03796860948204994), (499999, 0.0), (380927, -4.6029863357543945)]
([5, 2, 0], [8, 2, 7])
tensor(4.0134, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.04746076092123985), (499999, 0.0), (380927, -4.480883598327637)]
([5, 2, 0], [8, 2, 7])
tensor(4.0070, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.056952912360429764), (499999, 0.0), (380927, -4.4330186

 79%|███████▉  | 574/725 [19:13:52<7:29:43, 178.70s/it]

test：0.0, test mean: 0.22630718954248366
([0, 3, 0], [0, 3, 0])
tensor(1.3933, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 79%|███████▉  | 575/725 [19:14:17<5:31:36, 132.65s/it]

([1, 1, 3], [1, 1, 3])
tensor(5.7618, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.011045197956264019), (196607, -0.6987932920455933)]
([1, 1, 3], [1, 1, 3])
tensor(5.7832, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.022090395912528038), (196607, -1.1906895637512207)]
([1, 1, 3], [1, 1, 3])
tensor(5.4408, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.03313559293746948), (196607, -0.5228140950202942)]
([1, 1, 0], [1, 1, 3])
tensor(4.8276, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.044180791825056076), (196607, 0.4460177421569824)]
([1, 1, 0], [1, 1, 3])
tensor(4.6624, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.05522599071264267), (196607, 0.9923657774925232)]
([1, 1, 0], [1, 1, 3])
tensor(4.6519, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.06627119332551956), (196607, 1.1380698680877686)]
([1, 1, 0], [1, 1, 3])
tensor(4.65

 79%|███████▉  | 576/725 [19:19:40<7:51:34, 189.90s/it]

test：0.0, test mean: 0.2252032520325203
([5, 2, 5], [5, 2, 5])
tensor(5.0884, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.2942357659339905), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [5, 2, 5])
tensor(4.7390, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.04937920719385147), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [5, 2, 5])
tensor(4.5958, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.025308161973953247), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [5, 2, 5])
tensor(4.5957, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.003072977066040039), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [5, 2, 5])
tensor(4.5956, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 0.017995640635490417), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [5, 2, 5])
tensor(4.5956, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 0.03845733404159546), (499999, 0.0), (499999, 0.0)]
([0, 2, 5], [5, 2, 5])
tensor(4.5955, device='cuda:0', grad_fn=<NllLossBackward0>)
[

 80%|███████▉  | 577/725 [19:24:12<8:49:11, 214.54s/it]

test：0.0, test mean: 0.22411003236245955
([2, 8, 2], [2, 8, 2])
tensor(11.0452, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 80%|███████▉  | 578/725 [19:24:38<6:26:46, 157.87s/it]

([1, 1, 1], [1, 1, 1])
tensor(6.9623, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 80%|███████▉  | 579/725 [19:25:03<4:46:53, 117.90s/it]

([1, 1, 2], [1, 1, 2])
tensor(6.0808, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 80%|████████  | 580/725 [19:25:28<3:37:51, 90.15s/it] 

([2, 0, 7], [2, 0, 7])
tensor(4.5777, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.0010331821395084262)]


 80%|████████  | 581/725 [19:25:52<2:48:58, 70.41s/it]

([2, 2, 2], [2, 2, 2])
tensor(12.6164, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 80%|████████  | 582/725 [19:26:18<2:15:30, 56.86s/it]

([7, 2, 2], [7, 2, 2])
tensor(14.1706, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.3479323387145996), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(14.5101, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.41384220123291016), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(12.6266, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9252610206604004), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(10.7896, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.114924430847168), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(10.4971, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.507622718811035), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(10.4443, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.388459205627441), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(10.4246, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.736676216125488), (499999, 0.0

 80%|████████  | 583/725 [19:30:49<4:46:52, 121.22s/it]

test：0.0, test mean: 0.22302737520128824
([0, 2, 8], [0, 2, 8])
tensor(4.5043, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 0.02931644767522812), (499999, 0.0), (499999, -0.009668145328760147)]
([7, 2, 8], [0, 2, 8])
tensor(4.6593, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 3.881989002227783), (499999, 0.0), (499999, -0.019336290657520294)]
([0, 2, 8], [0, 2, 8])
tensor(4.3569, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 4.216254711151123), (499999, 0.0), (499999, -0.029004434123635292)]
([0, 2, 8], [0, 2, 8])
tensor(4.3468, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 4.3729939460754395), (499999, 0.0), (499999, -0.03867258131504059)]
([0, 2, 8], [0, 2, 8])
tensor(4.3410, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 4.465930938720703), (499999, 0.0), (499999, -0.048340726643800735)]
([0, 2, 8], [0, 2, 8])
tensor(4.3397, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 4.487601280212402), (499999, 0.0), (499999, -0.058008871972

 81%|████████  | 584/725 [19:35:13<6:25:40, 164.12s/it]

test：1.0, test mean: 0.2267628205128205
([6, 1, 0], [6, 1, 0])
tensor(2.0718, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 81%|████████  | 585/725 [19:35:38<4:45:37, 122.41s/it]

([2, 7, 2], [2, 7, 2])
tensor(12.9541, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.22122947871685028), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(13.0271, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -7.601773262023926), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(11.3222, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.111900806427002), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(9.7248, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.131803274154663), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(9.3429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.1507623195648193), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(9.2976, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.6725704669952393), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(9.2761, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.280562162399292), (

 81%|████████  | 586/725 [19:40:09<6:26:55, 167.02s/it]

test：0.0, test mean: 0.2256778309409888
([8, 2, 1], [8, 2, 1])
tensor(8.9362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.019826114177703857), (499999, 0.0009657393675297499)]


 81%|████████  | 587/725 [19:40:35<4:46:26, 124.54s/it]

([7, 5, 0], [7, 0, 7])
tensor(3.5634, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.1730220913887024), (499999, 0.0), (499999, 0.0)]
([7, 5, 0], [7, 0, 7])
tensor(3.4964, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -14.563095092773438), (499999, 0.0), (499999, 0.0)]
([7, 5, 0], [7, 0, 7])
tensor(2.3387, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.797116279602051), (499999, 0.0), (499999, 0.0)]
([7, 5, 0], [7, 0, 7])
tensor(0.8779, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.011085033416748), (499999, 0.0), (499999, 0.0)]
([0, 5, 0], [7, 0, 7])
tensor(0.5690, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.889669418334961), (499999, 0.0), (499999, 0.0)]
([0, 5, 0], [7, 0, 7])
tensor(0.5494, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.441643714904785), (499999, 0.0), (499999, 0.0)]
([0, 5, 0], [7, 0, 7])
tensor(0.5410, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.123971462249756), (499999, 0.0), (49999

 81%|████████  | 588/725 [19:44:58<6:19:14, 166.09s/it]

test：0.0, test mean: 0.22460317460317458
([2, 3, 0], [2, 3, 0])
tensor(5.1259, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -0.0011006970889866352)]
([2, 3, 0], [2, 3, 0])
tensor(5.2749, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 0.7729542255401611)]
([2, 3, 0], [2, 3, 0])
tensor(5.1240, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 0.6561976671218872)]
([2, 3, 0], [2, 3, 0])
tensor(5.2750, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.4847004413604736)]
([2, 3, 0], [2, 3, 0])
tensor(5.1141, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.658518075942993)]
([2, 3, 0], [2, 3, 0])
tensor(5.1606, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.243328809738159)]
([2, 3, 0], [2, 3, 0])
tensor(5.1084, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 

 81%|████████  | 589/725 [19:49:52<7:43:19, 204.41s/it]

test：1.0, test mean: 0.2282780410742496
([2, 2, 8], [2, 2, 7])
tensor(6.6305, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 81%|████████▏ | 590/725 [19:50:16<5:38:17, 150.35s/it]

([7, 5, 1], [7, 3, 1])
tensor(4.9359, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.2099635899066925), (499999, 0.0), (499999, 0.0)]
([7, 5, 1], [7, 3, 1])
tensor(4.8890, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.272134780883789), (499999, 0.0), (499999, 0.0)]
([7, 5, 1], [7, 3, 1])
tensor(3.4015, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.665799140930176), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 3, 1])
tensor(2.3891, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.84274959564209), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 3, 1])
tensor(2.3797, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.023138999938965), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 3, 1])
tensor(2.3728, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.117465972900391), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 3, 1])
tensor(2.3684, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.171010971069336), (499999, 0.0), (499999,

 82%|████████▏ | 591/725 [19:54:41<6:52:33, 184.73s/it]

test：0.0, test mean: 0.2272012578616352
([0, 7, 1], [0, 7, 1])
tensor(5.4949, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 0.0006022052839398384), (380927, 0.2102031111717224), (499999, 0.0)]
([7, 7, 1], [0, 7, 1])
tensor(5.6960, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 1.9957935810089111), (380927, -9.073112487792969), (499999, 0.0)]
([0, 7, 1], [0, 7, 1])
tensor(3.8003, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 2.877967357635498), (380927, -3.9178507328033447), (499999, 0.0)]
([0, 0, 1], [0, 7, 1])
tensor(2.5149, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 4.735424518585205), (380927, -2.537525177001953), (499999, 0.0)]
([0, 0, 1], [0, 7, 1])
tensor(2.2302, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 4.735366344451904), (380927, -2.5513434410095215), (499999, 0.0)]
([0, 0, 1], [0, 7, 1])
tensor(2.2073, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, 4.733010292053223), (380927, -2.7048559188842773), (499999, 0.0)]
([0,

 82%|████████▏ | 592/725 [19:59:19<7:51:46, 212.83s/it]

test：0.5, test mean: 0.22848200312989045
([2, 1, 7], [2, 1, 7])
tensor(8.3878, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 82%|████████▏ | 593/725 [19:59:45<5:44:38, 156.65s/it]

([0, 8, 1], [0, 8, 1])
tensor(4.1722, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.00012256624177098274), (499999, 0.0)]


 82%|████████▏ | 594/725 [20:00:10<4:15:45, 117.14s/it]

([1, 2, 0], [1, 2, 0])
tensor(7.8142, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, 0.055118221789598465)]
([1, 2, 0], [1, 2, 0])
tensor(7.8235, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -3.83837890625)]
([1, 2, 0], [1, 2, 0])
tensor(7.7922, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -4.253481864929199)]
([1, 2, 0], [1, 2, 0])
tensor(7.7481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -4.290114402770996)]
([1, 2, 0], [1, 2, 0])
tensor(7.7342, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -4.32004451751709)]
([1, 2, 0], [1, 2, 0])
tensor(7.7292, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -4.350639343261719)]
([1, 2, 0], [1, 2, 0])
tensor(7.7274, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -4.3944730758666

 82%|████████▏ | 595/725 [20:04:43<5:55:26, 164.05s/it]

test：1.0, test mean: 0.23208722741433022
([2, 1, 2], [2, 1, 2])
tensor(12.9507, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 82%|████████▏ | 596/725 [20:05:06<4:21:45, 121.75s/it]

([7, 7, 5], [0, 7, 5])
tensor(5.0128, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.2102201282978058), (491519, -0.41884708404541016)]
([7, 7, 0], [0, 7, 5])
tensor(4.4850, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.168529510498047), (491519, 5.013069152832031)]
([7, 7, 0], [0, 7, 5])
tensor(2.4690, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.7538628578186035), (491519, 5.208263874053955)]
([7, 7, 0], [0, 7, 5])
tensor(0.9369, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.5820636749267578), (491519, 5.332974910736084)]
([7, 0, 0], [0, 7, 5])
tensor(0.5959, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.4419666528701782), (491519, 5.427535057067871)]
([7, 0, 0], [0, 7, 5])
tensor(0.5429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.9067438840866089), (491519, 5.513854503631592)]
([7, 0, 0], [0, 7, 5])
tensor(0.5194, device=

 82%|████████▏ | 597/725 [20:09:48<6:02:04, 169.72s/it]

test：0.0, test mean: 0.23100775193798448
([2, 1, 7], [2, 1, 7])
tensor(7.3014, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 82%|████████▏ | 598/725 [20:10:14<4:27:44, 126.49s/it]

([1, 2, 1], [1, 2, 1])
tensor(8.4323, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 5.063745629740879e-05), (499999, 0.0), (499999, 0.0)]


 83%|████████▎ | 599/725 [20:10:39<3:22:11, 96.28s/it] 

([0, 1, 2], [0, 1, 2])
tensor(4.5407, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 83%|████████▎ | 600/725 [20:11:06<2:36:57, 75.34s/it]

([1, 2, 2], [1, 2, 2])
tensor(13.1017, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 83%|████████▎ | 601/725 [20:11:29<2:03:25, 59.72s/it]

([6, 0, 1], [6, 0, 1])
tensor(4.4925, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 83%|████████▎ | 602/725 [20:11:52<1:39:52, 48.72s/it]

([2, 2, 2], [2, 2, 2])
tensor(11.3194, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 83%|████████▎ | 603/725 [20:12:16<1:24:03, 41.34s/it]

([2, 1, 1], [2, 1, 1])
tensor(8.4898, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 83%|████████▎ | 604/725 [20:12:42<1:13:58, 36.68s/it]

([2, 6, 5], [2, 6, 5])
tensor(5.6737, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 83%|████████▎ | 605/725 [20:13:08<1:06:47, 33.39s/it]

([1, 1, 8], [1, 1, 8])
tensor(6.0063, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 84%|████████▎ | 606/725 [20:13:33<1:01:07, 30.82s/it]

([1, 2, 2], [1, 2, 2])
tensor(9.9940, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 84%|████████▎ | 607/725 [20:14:00<58:27, 29.72s/it]  

([2, 1, 5], [2, 1, 5])
tensor(8.5164, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 84%|████████▍ | 608/725 [20:14:26<56:12, 28.83s/it]

([2, 2, 1], [2, 2, 1])
tensor(8.7329, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 84%|████████▍ | 609/725 [20:14:51<53:08, 27.49s/it]

([1, 1, 7], [1, 1, 7])
tensor(6.7138, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.21020719408988953)]
([1, 1, 7], [1, 1, 7])
tensor(6.5965, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -9.07332992553711)]
([1, 1, 7], [1, 1, 7])
tensor(5.0352, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.780453681945801)]
([1, 1, 0], [1, 1, 7])
tensor(3.5037, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.382702827453613)]
([1, 1, 0], [1, 1, 7])
tensor(3.4376, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.239556312561035)]
([1, 1, 0], [1, 1, 7])
tensor(3.4326, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.185174465179443)]
([1, 1, 0], [1, 1, 7])
tensor(3.4312, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.1474905014

 84%|████████▍ | 610/725 [20:19:45<3:25:53, 107.42s/it]

test：0.0, test mean: 0.22993827160493827
([1, 1, 7], [1, 1, 7])
tensor(4.3776, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 84%|████████▍ | 611/725 [20:20:10<2:37:29, 82.89s/it] 

([1, 1, 1], [1, 1, 1])
tensor(5.5583, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 84%|████████▍ | 612/725 [20:20:36<2:03:35, 65.63s/it]

([3, 1, 2], [3, 1, 2])
tensor(6.5375, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 4.2584873881423846e-05)]


 85%|████████▍ | 613/725 [20:21:02<1:40:19, 53.75s/it]

([6, 2, 0], [6, 2, 0])
tensor(6.0617, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0013593960320577025), (499999, 0.0)]


 85%|████████▍ | 614/725 [20:21:27<1:23:45, 45.28s/it]

([2, 0, 2], [2, 0, 2])
tensor(8.1756, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -0.011706678196787834), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(8.3452, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -0.1467059850692749), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(8.1752, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -0.22169890999794006), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(8.3101, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 0.10431656241416931), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(8.1607, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 0.08635658025741577), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(8.1581, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 0.02800649404525757), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(8.1545, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -0.0128896832466125

 85%|████████▍ | 615/725 [20:25:58<3:26:42, 112.75s/it]

test：1.0, test mean: 0.2334869431643625
([5, 2, 0], [5, 2, 7])
tensor(5.7360, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 85%|████████▍ | 616/725 [20:26:23<2:37:01, 86.43s/it] 

([3, 5, 1], [7, 5, 1])
tensor(3.4599, device='cuda:0', grad_fn=<NllLossBackward0>)
[(459263, -0.5613795518875122), (499999, 0.0), (499999, 0.0)]
([3, 5, 1], [7, 5, 1])
tensor(3.4625, device='cuda:0', grad_fn=<NllLossBackward0>)
[(459263, -0.8140814304351807), (499999, 0.0), (499999, 0.0)]
([3, 5, 1], [7, 5, 1])
tensor(3.4534, device='cuda:0', grad_fn=<NllLossBackward0>)
[(459263, -1.1595778465270996), (499999, 0.0), (499999, 0.0)]
([3, 5, 1], [7, 5, 1])
tensor(3.3929, device='cuda:0', grad_fn=<NllLossBackward0>)
[(459263, -1.8940844535827637), (499999, 0.0), (499999, 0.0)]
([3, 5, 1], [7, 5, 1])
tensor(3.3752, device='cuda:0', grad_fn=<NllLossBackward0>)
[(459263, -3.089233636856079), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 5, 1])
tensor(2.8971, device='cuda:0', grad_fn=<NllLossBackward0>)
[(459263, -2.4148120880126953), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 5, 1])
tensor(2.8548, device='cuda:0', grad_fn=<NllLossBackward0>)
[(459263, -2.006542205810547), (499999, 0.0), (4

 85%|████████▌ | 617/725 [20:30:54<4:15:40, 142.04s/it]

test：0.0, test mean: 0.23241590214067276
([8, 3, 7], [8, 3, 7])
tensor(5.1356, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 85%|████████▌ | 618/725 [20:31:20<3:11:03, 107.14s/it]

([2, 0, 6], [2, 0, 6])
tensor(3.6974, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 85%|████████▌ | 619/725 [20:31:47<2:26:35, 82.97s/it] 

([2, 1, 0], [2, 1, 0])
tensor(7.4012, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 86%|████████▌ | 620/725 [20:32:10<1:53:59, 65.14s/it]

([2, 0, 7], [2, 0, 7])
tensor(5.5769, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 0.22091397643089294)]
([2, 0, 7], [2, 0, 7])
tensor(5.5907, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -6.109986305236816)]
([2, 0, 7], [2, 0, 7])
tensor(3.9966, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -3.4711074829101562)]
([2, 0, 0], [2, 0, 7])
tensor(3.1577, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -3.628679037094116)]
([2, 0, 0], [2, 0, 7])
tensor(3.1564, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -3.7552902698516846)]
([2, 0, 0], [2, 0, 7])
tensor(3.1544, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -3.8675081729888916)]
([2, 0, 0], [2, 0, 7])
tensor(3.1531, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -3.947026

 86%|████████▌ | 621/725 [20:37:10<3:54:57, 135.55s/it]

test：0.0, test mean: 0.2313546423135464
([5, 2, 0], [5, 2, 0])
tensor(5.5739, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 86%|████████▌ | 622/725 [20:37:35<2:55:37, 102.31s/it]

([0, 1, 2], [7, 1, 2])
tensor(6.2852, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 86%|████████▌ | 623/725 [20:37:59<2:14:22, 79.05s/it] 

([8, 1, 2], [8, 1, 2])
tensor(9.1122, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 86%|████████▌ | 624/725 [20:38:23<1:45:07, 62.45s/it]

([2, 8, 1], [2, 8, 1])
tensor(5.1879, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 0.5002841353416443), (499999, 0.0)]
([2, 8, 1], [2, 8, 1])
tensor(5.1375, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 2.965531349182129), (499999, 0.0)]
([2, 0, 1], [2, 8, 1])
tensor(4.4093, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 2.9270410537719727), (499999, 0.0)]
([2, 0, 1], [2, 8, 1])
tensor(4.4005, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 2.9181222915649414), (499999, 0.0)]
([2, 0, 1], [2, 8, 1])
tensor(4.3986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 2.9120469093322754), (499999, 0.0)]
([2, 0, 1], [2, 8, 1])
tensor(4.3979, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 2.910970449447632), (499999, 0.0)]
([2, 0, 1], [2, 8, 1])
tensor(4.3970, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 2.9128341674804688), (499999, 

 86%|████████▌ | 625/725 [20:42:47<3:24:41, 122.82s/it]

test：0.0, test mean: 0.2303030303030303
([7, 1, 5], [7, 1, 6])
tensor(6.5280, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.22122922539710999), (499999, 0.0), (499999, 0.0)]
([7, 1, 5], [7, 1, 6])
tensor(6.5900, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.601762294769287), (499999, 0.0), (499999, 0.0)]
([7, 1, 5], [7, 1, 6])
tensor(4.8865, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.1043262481689453), (499999, 0.0), (499999, 0.0)]
([7, 1, 5], [7, 1, 6])
tensor(3.3032, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.7169275283813477), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [7, 1, 6])
tensor(2.9517, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.819959878921509), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [7, 1, 6])
tensor(2.8932, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.759794235229492), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [7, 1, 6])
tensor(2.8733, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 

 86%|████████▋ | 626/725 [20:47:17<4:35:31, 166.99s/it]

test：0.0, test mean: 0.22926093514328807
([8, 8, 5], [8, 8, 5])
tensor(5.7250, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 86%|████████▋ | 627/725 [20:47:42<3:23:24, 124.54s/it]

([0, 1, 1], [0, 1, 1])
tensor(4.2062, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 87%|████████▋ | 628/725 [20:48:07<2:32:55, 94.59s/it] 

([1, 1, 3], [1, 1, 3])
tensor(4.9365, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.001704805064946413), (499999, 0.0), (466943, -0.5786563158035278)]
([1, 1, 3], [1, 1, 3])
tensor(4.9398, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.003409610129892826), (499999, 0.0), (466943, -2.8540782928466797)]
([1, 1, 7], [1, 1, 3])
tensor(4.2526, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.00511441333219409), (499999, 0.0), (466943, -3.13746976852417)]
([1, 1, 1], [1, 1, 3])
tensor(4.1364, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.006819220259785652), (499999, 0.0), (466943, -0.1527186930179596)]
([1, 1, 0], [1, 1, 3])
tensor(3.8561, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.008524024859070778), (499999, 0.0), (466943, 0.7315129041671753)]
([1, 1, 0], [1, 1, 3])
tensor(3.8294, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.010228827595710754), (499999, 0.0), (466943, 1.117597222328186)]
([1, 1, 0], [1, 1, 3])
tenso

 87%|████████▋ | 629/725 [20:52:45<3:59:17, 149.56s/it]

test：0.0, test mean: 0.22822822822822822
([2, 2, 2], [2, 2, 2])
tensor(13.8074, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.010222088545560837), (499999, 0.0)]


 87%|████████▋ | 630/725 [20:53:11<2:58:02, 112.44s/it]

([2, 5, 2], [2, 5, 2])
tensor(9.4465, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 87%|████████▋ | 631/725 [20:53:36<2:14:56, 86.13s/it] 

([6, 2, 6], [6, 2, 6])
tensor(5.8483, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 87%|████████▋ | 632/725 [20:54:00<1:44:49, 67.63s/it]

([1, 1, 2], [1, 1, 2])
tensor(8.1695, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.001454262062907219), (499999, 0.0)]


 87%|████████▋ | 633/725 [20:54:24<1:23:31, 54.48s/it]

([1, 6, 2], [0, 6, 2])
tensor(6.2958, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 87%|████████▋ | 634/725 [20:54:49<1:09:20, 45.72s/it]

([0, 2, 1], [0, 2, 1])
tensor(5.3104, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 88%|████████▊ | 635/725 [20:55:15<59:45, 39.84s/it]  

([2, 8, 8], [2, 7, 8])
tensor(5.5020, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.0007470670389011502)]


 88%|████████▊ | 636/725 [20:55:40<52:35, 35.45s/it]

([1, 1, 3], [1, 1, 3])
tensor(5.8418, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 88%|████████▊ | 637/725 [20:56:05<47:14, 32.21s/it]

([5, 2, 2], [5, 2, 2])
tensor(9.2862, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 88%|████████▊ | 638/725 [20:56:31<43:59, 30.34s/it]

([0, 8, 2], [0, 8, 2])
tensor(6.6555, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 8.134311792673543e-05), (499999, 0.0), (499999, 0.0)]


 88%|████████▊ | 639/725 [20:56:56<41:09, 28.72s/it]

([2, 1, 7], [2, 1, 7])
tensor(6.6314, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 0.20867660641670227)]
([2, 1, 7], [2, 1, 7])
tensor(6.7440, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -12.586380958557129)]
([2, 1, 7], [2, 1, 7])
tensor(5.2451, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -7.591606140136719)]
([2, 1, 0], [2, 1, 7])
tensor(4.5378, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -7.77672004699707)]
([2, 1, 0], [2, 1, 7])
tensor(4.5333, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -7.899130821228027)]
([2, 1, 0], [2, 1, 7])
tensor(4.5316, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -8.011922836303711)]
([2, 1, 0], [2, 1, 7])
tensor(4.5296, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -8.088662147

 88%|████████▊ | 640/725 [21:01:52<2:34:07, 108.80s/it]

test：0.0, test mean: 0.22720478325859492
([0, 7, 6], [0, 7, 6])
tensor(4.8092, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.2102210968732834), (499999, 0.0)]
([0, 7, 6], [0, 7, 6])
tensor(4.6111, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -7.781694412231445), (499999, 0.0)]
([0, 7, 6], [0, 7, 6])
tensor(3.0072, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.403255820274353), (499999, 0.0)]
([0, 7, 6], [0, 7, 6])
tensor(1.3524, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.8102192878723145), (499999, 0.0)]
([0, 0, 6], [0, 7, 6])
tensor(1.0615, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.0305986404418945), (499999, 0.0)]
([0, 0, 6], [0, 7, 6])
tensor(0.9918, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 3.615687847137451), (499999, 0.0)]
([0, 0, 6], [0, 7, 6])
tensor(0.9692, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0

 88%|████████▊ | 641/725 [21:06:22<3:40:19, 157.38s/it]

test：0.0, test mean: 0.2261904761904762
([0, 7, 8], [0, 7, 8])
tensor(3.1647, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.0837872251868248), (499999, 0.0), (499999, 0.0)]
([7, 7, 8], [0, 7, 8])
tensor(3.3935, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -1.5934655666351318), (499999, 0.0), (499999, 0.0)]
([0, 7, 8], [0, 7, 8])
tensor(3.0011, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.4650741517543793), (499999, 0.0), (499999, 0.0)]
([0, 7, 8], [0, 7, 8])
tensor(2.9853, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.27956026792526245), (499999, 0.0), (499999, 0.0)]
([0, 7, 8], [0, 7, 8])
tensor(2.9838, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.12432411313056946), (499999, 0.0), (499999, 0.0)]
([0, 7, 8], [0, 7, 8])
tensor(2.9827, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 0.007304459810256958), (499999, 0.0), (499999, 0.0)]
([0, 7, 8], [0, 7, 8])
tensor(2.9819, device='cuda:0', grad_fn=<NllLossBackward0>)
[(47

 89%|████████▊ | 642/725 [21:10:50<4:23:26, 190.44s/it]

test：1.0, test mean: 0.22962962962962963
([2, 1, 2], [2, 1, 2])
tensor(10.5858, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 89%|████████▊ | 643/725 [21:11:15<3:12:17, 140.70s/it]

([0, 1, 8], [7, 1, 8])
tensor(4.5089, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.000735217472538352)]


 89%|████████▉ | 644/725 [21:11:41<2:23:48, 106.52s/it]

([7, 0, 2], [0, 0, 2])
tensor(3.9498, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -0.006447734776884317), (499999, 0.0)]
([7, 0, 2], [0, 0, 2])
tensor(4.2058, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -1.8067240715026855), (499999, 0.0)]
([7, 0, 2], [0, 0, 2])
tensor(3.9429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -2.0056746006011963), (499999, 0.0)]
([7, 0, 2], [0, 0, 2])
tensor(4.0379, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -0.8578366041183472), (499999, 0.0)]
([7, 0, 2], [0, 0, 2])
tensor(3.9189, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -0.8876371383666992), (499999, 0.0)]
([7, 0, 2], [0, 0, 2])
tensor(3.9245, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -0.7016724348068237), (499999, 0.0)]
([7, 0, 2], [0, 0, 2])
tensor(3.9161, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -0.7585963010787964)

 89%|████████▉ | 645/725 [21:16:16<3:29:18, 156.99s/it]

test：1.0, test mean: 0.23303834808259585
([5, 7, 1], [5, 7, 1])
tensor(6.8198, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.3589409589767456), (499999, 0.0)]
([5, 7, 1], [5, 7, 1])
tensor(7.1614, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.3430620431900024), (499999, 0.0)]
([5, 7, 1], [5, 7, 1])
tensor(5.3102, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 5.550185203552246), (499999, 0.0)]
([5, 7, 1], [5, 7, 1])
tensor(3.4362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.5213637351989746), (499999, 0.0)]
([5, 0, 1], [5, 7, 1])
tensor(3.1849, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 3.750844717025757), (499999, 0.0)]
([5, 0, 1], [5, 7, 1])
tensor(3.1421, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 3.376901626586914), (499999, 0.0)]
([5, 0, 1], [5, 7, 1])
tensor(3.1233, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0

 89%|████████▉ | 646/725 [21:20:47<4:11:34, 191.07s/it]

test：0.0, test mean: 0.23201174743024963
([0, 0, 5], [0, 0, 8])
tensor(1.0377, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -1.7125717022281606e-06), (499999, 0.0)]


 89%|████████▉ | 647/725 [21:21:12<3:03:55, 141.48s/it]

([2, 0, 0], [2, 0, 0])
tensor(3.9865, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.003292026463896036), (380927, 0.02833889052271843)]
([2, 0, 7], [2, 0, 0])
tensor(4.2833, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.5360784530639648), (380927, 4.181034088134766)]
([2, 0, 0], [2, 0, 0])
tensor(3.9396, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.5437597036361694), (380927, 6.529763221740723)]
([2, 0, 0], [2, 0, 0])
tensor(4.1246, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.8885402083396912), (380927, 10.84521198272705)]
([2, 0, 0], [2, 0, 0])
tensor(3.8876, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.8826223611831665), (380927, 10.897589683532715)]
([2, 0, 0], [2, 0, 0])
tensor(3.8861, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.899789571762085), (380927, 10.943286895751953)]
([2, 0, 0], [2, 0, 0])
tensor(3.8853, dev

 89%|████████▉ | 648/725 [21:26:06<4:00:03, 187.06s/it]

test：1.0, test mean: 0.23538011695906433
([8, 2, 1], [4, 2, 1])
tensor(9.1748, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 90%|████████▉ | 649/725 [21:26:31<2:55:23, 138.47s/it]

([5, 8, 1], [5, 8, 1])
tensor(4.9765, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0027420492842793465)]


 90%|████████▉ | 650/725 [21:26:56<2:10:38, 104.51s/it]

([3, 7, 5], [3, 7, 5])
tensor(4.2707, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 90%|████████▉ | 651/725 [21:27:23<1:40:03, 81.13s/it] 

([1, 5, 2], [1, 5, 2])
tensor(4.9590, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -0.47405779361724854), (499999, 0.0), (499999, 0.0)]
([1, 5, 2], [1, 5, 2])
tensor(4.9513, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -1.8140878677368164), (499999, 0.0), (499999, 0.0)]
([0, 5, 2], [1, 5, 2])
tensor(4.7608, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -2.494497299194336), (499999, 0.0), (499999, 0.0)]
([0, 5, 2], [1, 5, 2])
tensor(4.7445, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -3.189373731613159), (499999, 0.0), (499999, 0.0)]
([0, 5, 2], [1, 5, 2])
tensor(4.6833, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -3.4085330963134766), (499999, 0.0), (499999, 0.0)]
([0, 5, 2], [1, 5, 2])
tensor(4.6757, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -3.8943705558776855), (499999, 0.0), (499999, 0.0)]
([0, 5, 2], [1, 5, 2])
tensor(4.6714, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -4.3262038230896), (499999, 0.0), (499

 90%|████████▉ | 652/725 [21:31:49<2:46:06, 136.53s/it]

test：0.0, test mean: 0.2343522561863173
([5, 7, 1], [5, 7, 1])
tensor(6.5231, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21020781993865967), (499999, 0.0)]
([5, 7, 1], [5, 7, 1])
tensor(6.4104, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.073381423950195), (499999, 0.0)]
([5, 7, 1], [5, 7, 1])
tensor(4.8161, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.7822418212890625), (499999, 0.0)]
([5, 0, 1], [5, 7, 1])
tensor(3.2932, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.1572957038879395), (499999, 0.0)]
([5, 0, 1], [5, 7, 1])
tensor(3.2347, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.052677631378174), (499999, 0.0)]
([5, 0, 1], [5, 7, 1])
tensor(3.2321, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.922600269317627), (499999, 0.0)]
([5, 0, 1], [5, 7, 1])
tensor(3.2282, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 

 90%|█████████ | 653/725 [21:36:26<3:34:39, 178.89s/it]

test：0.0, test mean: 0.2333333333333333
([2, 2, 1], [2, 2, 1])
tensor(8.6637, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0002595843398012221)]


 90%|█████████ | 654/725 [21:36:51<2:37:04, 132.74s/it]

([2, 3, 1], [2, 3, 1])
tensor(6.3754, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -0.811796247959137), (499999, 0.0)]
([2, 3, 1], [2, 3, 1])
tensor(6.3321, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -0.883939266204834), (499999, 0.0)]
([2, 0, 1], [2, 3, 1])
tensor(5.1800, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.6206924915313721), (499999, 0.0)]
([2, 0, 1], [2, 3, 1])
tensor(5.0033, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 1.615091323852539), (499999, 0.0)]
([2, 0, 1], [2, 3, 1])
tensor(4.9246, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 1.8398151397705078), (499999, 0.0)]
([2, 0, 1], [2, 3, 1])
tensor(4.9015, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 2.1637375354766846), (499999, 0.0)]
([2, 0, 1], [2, 3, 1])
tensor(4.8889, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 2.40462327003479), (499999, 0

 90%|█████████ | 655/725 [21:41:20<3:22:32, 173.61s/it]

test：0.0, test mean: 0.23232323232323232
([7, 0, 0], [7, 1, 0])
tensor(3.4398, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.2101917862892151), (499999, 0.0), (499999, 0.0)]
([7, 0, 0], [7, 1, 0])
tensor(3.4288, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.092033386230469), (499999, 0.0), (499999, 0.0)]
([7, 0, 0], [7, 1, 0])
tensor(1.8701, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.901569843292236), (499999, 0.0), (499999, 0.0)]
([0, 0, 0], [7, 1, 0])
tensor(0.5080, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.720217704772949), (499999, 0.0), (499999, 0.0)]
([0, 0, 0], [7, 1, 0])
tensor(0.4818, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.922114849090576), (499999, 0.0), (499999, 0.0)]
([0, 0, 0], [7, 1, 0])
tensor(0.4756, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.064640045166016), (499999, 0.0), (499999, 0.0)]
([0, 0, 0], [7, 1, 0])
tensor(0.4724, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5

 90%|█████████ | 656/725 [21:45:48<3:52:06, 201.83s/it]

test：0.0, test mean: 0.23132183908045975
([8, 2, 2], [8, 2, 2])
tensor(9.4720, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 91%|█████████ | 657/725 [21:46:12<2:48:11, 148.41s/it]

([0, 1, 2], [0, 1, 2])
tensor(6.0599, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.011282885447144508), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(6.1269, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.287601113319397), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(6.0592, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.3332325220108032), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(6.1219, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.49927717447280884), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(6.0607, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.4943821430206299), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(6.2461, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.2533589601516724), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(6.0566, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.2454265356063843), (499999, 0.0), 

 91%|█████████ | 658/725 [21:50:43<3:26:55, 185.31s/it]

test：1.0, test mean: 0.234620886981402
([7, 0, 0], [7, 0, 0])
tensor(3.8606, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.3589417636394501), (442367, 0.12569290399551392), (466943, 0.002788878045976162)]
([7, 7, 1], [7, 0, 0])
tensor(4.8629, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -3.380094289779663), (442367, 3.6421713829040527), (466943, -6.852635383605957)]
([7, 0, 0], [7, 0, 0])
tensor(2.2107, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.766313076019287), (442367, 4.033941268920898), (466943, -6.865707874298096)]
([7, 7, 1], [7, 0, 0])
tensor(1.5422, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.422362327575684), (442367, 1.241539716720581), (466943, -13.721132278442383)]
([0, 0, 0], [7, 0, 0])
tensor(0.1334, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.1475934982299805), (442367, 1.343736171722412), (466943, -13.734199523925781)]
([0, 0, 1], [7, 0, 0])
tensor(0.4565, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380

 91%|█████████ | 659/725 [21:55:33<3:58:26, 216.76s/it]

test：0.3333333333333333, test mean: 0.23504273504273504
([8, 8, 2], [8, 8, 2])
tensor(5.7800, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 91%|█████████ | 660/725 [21:56:01<2:53:22, 160.05s/it]

([0, 1, 8], [0, 1, 8])
tensor(4.1958, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.07138003408908844), (499999, 0.0), (499999, 0.0)]
([7, 1, 8], [0, 1, 8])
tensor(4.4475, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.14203178882598877), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [0, 1, 8])
tensor(4.1879, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.5798317193984985), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [0, 1, 8])
tensor(4.3834, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.47297418117523193), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [0, 1, 8])
tensor(4.1857, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.5106446743011475), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [0, 1, 8])
tensor(4.3056, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -5.808619976043701), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [0, 1, 8])
tensor(4.1547, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -5.836943626403809), (499999, 0.0),

 91%|█████████ | 661/725 [22:00:26<3:24:24, 191.64s/it]

test：1.0, test mean: 0.23829787234042554
([2, 1, 0], [2, 1, 0])
tensor(6.4980, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 91%|█████████▏| 662/725 [22:00:52<2:28:53, 141.80s/it]

([2, 8, 2], [2, 8, 2])
tensor(9.5424, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 0.2588631510734558), (499999, 0.0)]
([2, 8, 2], [2, 8, 2])
tensor(9.7303, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 6.611492156982422), (499999, 0.0)]
([2, 0, 2], [2, 8, 2])
tensor(8.9449, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 7.848262786865234), (499999, 0.0)]
([2, 0, 2], [2, 8, 2])
tensor(8.8812, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 8.546587944030762), (499999, 0.0)]
([2, 0, 2], [2, 8, 2])
tensor(8.8747, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 9.079656600952148), (499999, 0.0)]
([2, 0, 2], [2, 8, 2])
tensor(8.8717, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 9.344525337219238), (499999, 0.0)]
([2, 0, 2], [2, 8, 2])
tensor(8.8707, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 9.581850051879883), (499999, 0.0)

 91%|█████████▏| 663/725 [22:05:19<3:05:17, 179.31s/it]

test：0.0, test mean: 0.23728813559322035
([1, 1, 2], [1, 1, 2])
tensor(7.6324, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 92%|█████████▏| 664/725 [22:05:44<2:15:26, 133.22s/it]

([2, 2, 1], [2, 2, 1])
tensor(11.4115, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.005439170636236668)]


 92%|█████████▏| 665/725 [22:06:10<1:40:56, 100.94s/it]

([0, 6, 2], [0, 6, 2])
tensor(4.8758, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.0469915047287941), (499999, 0.0), (499999, 0.0)]
([0, 6, 2], [0, 6, 2])
tensor(5.0654, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 1.942799687385559), (499999, 0.0), (499999, 0.0)]
([0, 6, 2], [0, 6, 2])
tensor(4.8710, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 1.5524215698242188), (499999, 0.0), (499999, 0.0)]
([0, 6, 2], [0, 6, 2])
tensor(4.9888, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.02969716489315033), (499999, 0.0), (499999, 0.0)]
([0, 6, 2], [0, 6, 2])
tensor(4.8584, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.1317213624715805), (499999, 0.0), (499999, 0.0)]
([0, 6, 2], [0, 6, 2])
tensor(4.9044, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -1.4517600536346436), (499999, 0.0), (499999, 0.0)]
([0, 6, 2], [0, 6, 2])
tensor(4.8553, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -1.5540761947631836), (499999, 0.0), (4

 92%|█████████▏| 666/725 [22:10:35<2:27:33, 150.05s/it]

test：1.0, test mean: 0.24050632911392406
([1, 1, 2], [1, 1, 2])
tensor(8.1440, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 92%|█████████▏| 667/725 [22:11:00<1:48:44, 112.50s/it]

([0, 2, 2], [0, 2, 2])
tensor(6.8305, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 92%|█████████▏| 668/725 [22:11:26<1:22:12, 86.54s/it] 

([2, 7, 2], [2, 7, 2])
tensor(12.3917, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21022027730941772), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(12.1994, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.273931503295898), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(10.5015, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.236560344696045), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(8.9629, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.5830574035644531), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(8.6659, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.952273368835449), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(8.6020, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.553874969482422), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(8.5839, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.301481008529663), (4

 92%|█████████▏| 669/725 [22:15:52<2:11:16, 140.64s/it]

test：0.0, test mean: 0.23949579831932774
([2, 1, 0], [2, 1, 0])
tensor(6.1263, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.001359368092380464), (499999, 0.0), (499999, -0.006751439534127712)]


 92%|█████████▏| 670/725 [22:16:18<1:37:10, 106.01s/it]

([8, 1, 7], [0, 1, 7])
tensor(5.4552, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 0.20707261562347412)]
([8, 1, 7], [0, 1, 7])
tensor(5.3466, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -11.375467300415039)]
([8, 1, 7], [0, 1, 7])
tensor(3.8998, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -6.133681297302246)]
([8, 1, 0], [0, 1, 7])
tensor(3.2793, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -6.187016010284424)]
([8, 1, 0], [0, 1, 7])
tensor(3.2781, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -6.234735012054443)]
([8, 1, 0], [0, 1, 7])
tensor(3.2770, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -6.268684387207031)]
([8, 1, 0], [0, 1, 7])
tensor(3.2766, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -6.29962539

 93%|█████████▎| 671/725 [22:21:08<2:25:09, 161.28s/it]

test：0.0, test mean: 0.2384937238493724
([5, 2, 6], [8, 2, 6])
tensor(7.9139, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 93%|█████████▎| 672/725 [22:21:32<1:46:04, 120.08s/it]

([2, 1, 2], [2, 1, 2])
tensor(8.7487, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 93%|█████████▎| 673/725 [22:21:57<1:19:25, 91.65s/it] 

([5, 0, 0], [2, 0, 0])
tensor(1.5396, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.015547361224889755)]
([5, 0, 0], [2, 0, 0])
tensor(1.6383, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.02353343367576599)]
([5, 0, 0], [2, 0, 0])
tensor(1.5012, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.13421574234962463)]
([5, 0, 0], [2, 0, 0])
tensor(1.4986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.18424707651138306)]
([5, 0, 0], [2, 0, 0])
tensor(1.4972, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.21702513098716736)]
([5, 0, 0], [2, 0, 0])
tensor(1.4969, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.24703983962535858)]
([5, 0, 0], [2, 0, 0])
tensor(1.4966, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.2741

 93%|█████████▎| 674/725 [22:26:42<2:07:08, 149.57s/it]

test：1.0, test mean: 0.24166666666666667
([1, 0, 7], [1, 0, 7])
tensor(4.3358, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.01929525099694729), (368639, -0.3476482331752777)]
([1, 0, 7], [1, 0, 7])
tensor(4.8458, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.03859050199389458), (368639, -1.451744556427002)]
([1, 0, 7], [1, 0, 7])
tensor(3.5894, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.05788575857877731), (368639, 1.2548186779022217)]
([1, 0, 0], [1, 0, 7])
tensor(2.1490, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.07718100398778915), (368639, 1.0704782009124756)]
([1, 0, 0], [1, 0, 7])
tensor(2.1480, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.09647626429796219), (368639, 0.9196102619171143)]
([1, 0, 0], [1, 0, 7])
tensor(2.1468, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.11577151715755463), (368639, 0.78334295749

 93%|█████████▎| 675/725 [22:31:48<2:43:41, 196.43s/it]

test：0.0, test mean: 0.24066390041493776
([1, 0, 0], [1, 0, 0])
tensor(2.6183, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.41042211651802063), (499999, 0.0)]
([1, 0, 0], [1, 0, 0])
tensor(2.6253, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.5838416814804077), (499999, 0.0)]
([1, 3, 0], [1, 0, 0])
tensor(2.6258, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.7375581860542297), (499999, 0.0)]
([1, 3, 0], [1, 0, 0])
tensor(2.5788, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -1.234015703201294), (499999, 0.0)]
([1, 0, 0], [1, 0, 0])
tensor(2.5344, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -1.6557817459106445), (499999, 0.0)]
([1, 0, 0], [1, 0, 0])
tensor(2.3266, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -1.2617106437683105), (499999, 0.0)]
([1, 0, 0], [1, 0, 0])
tensor(2.3049, device='cuda:0', grad_fn=<NllLossBackward0>)
[(4999

 93%|█████████▎| 676/725 [22:36:42<3:04:23, 225.78s/it]

test：1.0, test mean: 0.24380165289256198
([0, 2, 2], [0, 2, 2])
tensor(7.2557, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 93%|█████████▎| 677/725 [22:37:08<2:12:45, 165.96s/it]

([1, 2, 1], [1, 2, 1])
tensor(8.2541, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.006280192639678717), (499999, 0.0), (499999, 0.0)]


 94%|█████████▎| 678/725 [22:37:36<1:37:26, 124.40s/it]

([7, 1, 5], [7, 1, 5])
tensor(7.6648, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21021994948387146), (499999, 0.0), (499999, 0.0)]
([7, 1, 5], [7, 1, 5])
tensor(7.7518, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.797616958618164), (499999, 0.0), (499999, 0.0)]
([7, 1, 5], [7, 1, 5])
tensor(6.1097, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.082640171051025), (499999, 0.0), (499999, 0.0)]
([7, 1, 5], [7, 1, 5])
tensor(4.5448, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.662291407585144), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [7, 1, 5])
tensor(4.1053, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.3381530046463013), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [7, 1, 5])
tensor(4.0527, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.646578311920166), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [7, 1, 5])
tensor(4.0307, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.3514132499694824), (499999, 0.0), (499

 94%|█████████▎| 679/725 [22:42:07<2:09:03, 168.35s/it]

test：0.0, test mean: 0.24279835390946503
([8, 0, 0], [8, 0, 5])
tensor(0.9336, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -0.06481572985649109)]
([8, 0, 0], [8, 0, 5])
tensor(0.8113, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -7.216795921325684)]
([8, 0, 0], [8, 0, 5])
tensor(0.6639, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -7.202028274536133)]
([8, 0, 0], [8, 0, 5])
tensor(0.6512, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -7.195116996765137)]
([8, 0, 0], [8, 0, 5])
tensor(0.6414, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -7.277657508850098)]
([8, 0, 0], [8, 0, 5])
tensor(0.6380, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -7.33799409866333)]
([8, 0, 0], [8, 0, 5])
tensor(0.6362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0

 94%|█████████▍| 680/725 [22:46:43<2:30:37, 200.84s/it]

test：0.0, test mean: 0.24180327868852458
([5, 2, 1], [5, 2, 1])
tensor(7.1792, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.005744783207774162)]


 94%|█████████▍| 681/725 [22:47:09<1:48:39, 148.16s/it]

([1, 0, 2], [1, 0, 2])
tensor(6.8092, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 94%|█████████▍| 682/725 [22:47:34<1:19:44, 111.26s/it]

([2, 3, 1], [2, 3, 1])
tensor(8.2356, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.001930277794599533)]


 94%|█████████▍| 683/725 [22:48:00<1:00:07, 85.89s/it] 

([5, 2, 7], [5, 2, 7])
tensor(6.0462, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 94%|█████████▍| 684/725 [22:48:29<47:01, 68.83s/it]  

([0, 0, 2], [0, 0, 2])
tensor(4.4852, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 94%|█████████▍| 685/725 [22:48:54<37:07, 55.68s/it]

([3, 0, 2], [3, 0, 2])
tensor(3.9900, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, 0.1331264227628708), (442367, -0.012891840189695358), (499999, 0.0)]
([3, 0, 2], [3, 0, 2])
tensor(3.7676, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, 6.915753364562988), (442367, -1.9199844598770142), (499999, 0.0)]
([3, 0, 2], [3, 0, 2])
tensor(3.4889, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, 17.10641860961914), (442367, -2.0079362392425537), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(3.0531, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, 16.393362045288086), (442367, -3.4641454219818115), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(2.9096, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, 16.0592098236084), (442367, -3.515571355819702), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(2.9713, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, 15.855982780456543), (442367, -6.273833274841309), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(2.9037, device

 95%|█████████▍| 686/725 [22:53:30<1:18:59, 121.54s/it]

test：0.5, test mean: 0.24285714285714285
([1, 3, 6], [1, 3, 6])
tensor(4.3599, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 95%|█████████▍| 687/725 [22:53:57<59:05, 93.31s/it]   

([0, 2, 1], [0, 2, 1])
tensor(4.8910, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 0.028150364756584167), (499999, 0.0), (499999, 0.0)]
([7, 2, 1], [0, 2, 1])
tensor(5.0156, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 3.389770030975342), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [0, 2, 1])
tensor(4.7338, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 3.735710382461548), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [0, 2, 1])
tensor(4.7281, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 3.924464225769043), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [0, 2, 1])
tensor(4.7255, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 4.052553176879883), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [0, 2, 1])
tensor(4.7241, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 4.14918851852417), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [0, 2, 1])
tensor(4.7234, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 4.227806091308594), (499999, 0.0), (499999, 0.0

 95%|█████████▍| 688/725 [22:58:41<1:32:49, 150.51s/it]

test：1.0, test mean: 0.2459349593495935
([1, 1, 0], [1, 1, 0])
tensor(5.3892, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 95%|█████████▌| 689/725 [22:59:10<1:08:22, 113.97s/it]

([1, 1, 3], [1, 1, 3])
tensor(3.4602, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 95%|█████████▌| 690/725 [22:59:39<51:37, 88.50s/it]   

([2, 5, 2], [2, 8, 2])
tensor(10.1439, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 95%|█████████▌| 691/725 [23:00:06<39:39, 69.99s/it]

([8, 0, 2], [8, 0, 2])
tensor(4.9609, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 95%|█████████▌| 692/725 [23:00:33<31:27, 57.19s/it]

([2, 5, 2], [2, 5, 2])
tensor(12.4792, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 96%|█████████▌| 693/725 [23:01:01<25:46, 48.34s/it]

([1, 7, 6], [1, 7, 6])
tensor(5.7747, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 0.20691217482089996), (499999, 0.0)]
([1, 7, 6], [1, 7, 6])
tensor(5.6634, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -12.572078704833984), (499999, 0.0)]
([1, 7, 6], [1, 7, 6])
tensor(4.1438, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -9.949409484863281), (499999, 0.0)]
([1, 0, 6], [1, 7, 6])
tensor(3.5423, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -10.064691543579102), (499999, 0.0)]
([1, 0, 6], [1, 7, 6])
tensor(3.5411, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -10.15567684173584), (499999, 0.0)]
([1, 0, 6], [1, 7, 6])
tensor(3.5406, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -10.241887092590332), (499999, 0.0)]
([1, 0, 6], [1, 7, 6])
tensor(3.5403, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -10.319658279418945), (4

 96%|█████████▌| 694/725 [23:05:49<1:02:09, 120.31s/it]

test：0.0, test mean: 0.24493927125506074
([3, 7, 1], [3, 7, 1])
tensor(5.4942, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21022126078605652), (454655, -0.49967771768569946)]
([3, 7, 1], [3, 7, 1])
tensor(5.5174, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.797657012939453), (454655, -0.7375325560569763)]
([3, 7, 1], [3, 7, 1])
tensor(3.9383, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.083893775939941), (454655, -0.9633521437644958)]
([3, 7, 1], [3, 7, 1])
tensor(2.3274, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.7621697187423706), (454655, -1.181753158569336)]
([3, 0, 1], [3, 7, 1])
tensor(1.8624, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.740782618522644), (454655, -1.4012843370437622)]
([3, 0, 1], [3, 7, 1])
tensor(1.7734, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.9403257966041565), (454655, -1.6178747415542603

 96%|█████████▌| 695/725 [23:10:41<1:25:58, 171.97s/it]

test：0.0, test mean: 0.2439516129032258
([2, 3, 1], [2, 3, 1])
tensor(9.2150, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 3.579253097996116e-05), (499999, 0.0), (499999, 0.0)]


 96%|█████████▌| 696/725 [23:11:07<1:01:52, 128.01s/it]

([5, 5, 1], [5, 5, 1])
tensor(4.3359, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 96%|█████████▌| 697/725 [23:11:33<45:27, 97.41s/it]   

([1, 2, 1], [1, 2, 1])
tensor(8.0666, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0009255248587578535)]


 96%|█████████▋| 698/725 [23:12:00<34:18, 76.24s/it]

([0, 1, 2], [0, 1, 2])
tensor(5.5913, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 96%|█████████▋| 699/725 [23:12:27<26:39, 61.53s/it]

([7, 0, 2], [7, 5, 2])
tensor(7.9983, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.20994044840335846), (196607, -0.26641151309013367), (499999, 0.0)]
([7, 0, 2], [7, 5, 2])
tensor(7.9634, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -7.863028526306152), (196607, 7.251187324523926), (499999, 0.0)]
([7, 0, 2], [7, 5, 2])
tensor(6.3788, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.096290349960327), (196607, 7.660953998565674), (499999, 0.0)]
([0, 0, 2], [7, 5, 2])
tensor(5.3448, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.1378746032714844), (196607, 11.546886444091797), (499999, 0.0)]
([0, 0, 2], [7, 5, 2])
tensor(5.1542, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.1810145378112793), (196607, 11.547258377075195), (499999, 0.0)]
([0, 0, 2], [7, 5, 2])
tensor(5.1511, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.25077748298645), (196607, 11.547492027282715), (499999, 0.0)]
([0, 0, 2], [7, 5, 2])
tensor(5.1496, device

 97%|█████████▋| 700/725 [23:17:29<55:44, 133.78s/it]

test：0.0, test mean: 0.2429718875502008
([1, 0, 8], [1, 0, 8])
tensor(3.9576, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 97%|█████████▋| 701/725 [23:17:55<40:32, 101.36s/it]

([3, 0, 2], [3, 0, 2])
tensor(5.7282, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 97%|█████████▋| 702/725 [23:18:21<30:13, 78.85s/it] 

([0, 0, 1], [0, 0, 1])
tensor(2.9633, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 0.008119793608784676), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(3.0460, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -0.05679363012313843), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(2.9450, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 0.09565617889165878), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(2.9499, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 0.36758267879486084), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(2.9370, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 0.388916552066803), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(2.9367, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 0.4062974154949188), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(2.9365, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 0.4204291105270386), (4

 97%|█████████▋| 703/725 [23:22:52<50:03, 136.53s/it]

test：1.0, test mean: 0.246
([1, 1, 1], [1, 1, 1])
tensor(6.1989, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.005608608014881611), (499999, 0.0), (499999, 0.0)]


 97%|█████████▋| 704/725 [23:23:19<36:13, 103.52s/it]

([0, 1, 2], [0, 1, 2])
tensor(7.3897, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 97%|█████████▋| 705/725 [23:23:47<26:58, 80.94s/it] 

([3, 5, 6], [3, 8, 6])
tensor(3.8271, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 97%|█████████▋| 706/725 [23:24:14<20:28, 64.65s/it]

([0, 7, 2], [0, 7, 2])
tensor(8.9683, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21021974086761475), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(8.7713, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -8.963220596313477), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(7.0963, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.745434522628784), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(5.5351, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.2299036979675293), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(5.2621, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.4409773349761963), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(5.2073, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.1289846897125244), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(5.1908, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.2099180221557617), (4

 98%|█████████▊| 707/725 [23:29:00<39:19, 131.06s/it]

test：0.0, test mean: 0.2450199203187251
([0, 2, 0], [0, 2, 0])
tensor(4.1488, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 98%|█████████▊| 708/725 [23:29:26<28:11, 99.51s/it] 

([2, 2, 2], [2, 2, 2])
tensor(9.5944, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 98%|█████████▊| 709/725 [23:29:52<20:39, 77.50s/it]

([1, 1, 2], [1, 1, 2])
tensor(8.2041, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 98%|█████████▊| 710/725 [23:30:17<15:29, 61.96s/it]

([8, 5, 6], [8, 5, 6])
tensor(4.3840, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 98%|█████████▊| 711/725 [23:30:42<11:51, 50.80s/it]

([2, 5, 0], [2, 5, 0])
tensor(6.4834, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 0.14786683022975922), (442367, 0.000931035727262497)]
([2, 1, 0], [2, 5, 0])
tensor(5.6574, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 0.014224708080291748), (442367, 1.0586799383163452)]
([2, 0, 0], [2, 5, 0])
tensor(4.9531, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 12.802324295043945), (442367, 0.9246729016304016)]
([2, 0, 0], [2, 5, 0])
tensor(4.9543, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 12.797657012939453), (442367, 0.5174693465232849)]
([2, 0, 0], [2, 5, 0])
tensor(4.7839, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 12.851652145385742), (442367, 0.5291990041732788)]
([2, 0, 0], [2, 5, 0])
tensor(4.9274, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 12.839300155639648), (442367, 1.2509610652923584)]
([2, 0, 0], [2, 5, 0])
tensor(4.7736, dev

 98%|█████████▊| 712/725 [23:35:26<26:08, 120.64s/it]

test：0.5, test mean: 0.24603174603174602
([2, 5, 8], [2, 8, 8])
tensor(4.8927, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 98%|█████████▊| 713/725 [23:35:51<18:23, 91.99s/it] 

([1, 5, 2], [1, 8, 2])
tensor(8.2863, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 98%|█████████▊| 714/725 [23:36:16<13:10, 71.87s/it]

([8, 2, 2], [8, 2, 2])
tensor(8.9615, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 99%|█████████▊| 715/725 [23:36:42<09:42, 58.21s/it]

([1, 5, 2], [1, 5, 2])
tensor(8.0643, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 99%|█████████▉| 716/725 [23:37:09<07:17, 48.63s/it]

([1, 1, 5], [1, 1, 5])
tensor(5.0503, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0054917121306061745), (499999, 0.0)]


 99%|█████████▉| 717/725 [23:37:33<05:32, 41.53s/it]

([5, 2, 0], [5, 2, 0])
tensor(6.7678, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.00013459200272336602), (499999, 0.0), (499999, 0.0)]


 99%|█████████▉| 718/725 [23:37:58<04:15, 36.51s/it]

([1, 0, 2], [3, 0, 2])
tensor(4.5209, device='cuda:0', grad_fn=<NllLossBackward0>)
[(341156, -0.7339420914649963), (491519, -0.008035179227590561), (499999, 0.0)]
([1, 0, 2], [3, 0, 2])
tensor(4.5429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(341156, -1.7590663433074951), (491519, -0.8921264410018921), (499999, 0.0)]
([1, 0, 2], [3, 0, 2])
tensor(4.4684, device='cuda:0', grad_fn=<NllLossBackward0>)
[(341156, -3.9651265144348145), (491519, -0.9301724433898926), (499999, 0.0)]
([7, 0, 2], [3, 0, 2])
tensor(4.5867, device='cuda:0', grad_fn=<NllLossBackward0>)
[(341156, -1.8984454870224), (491519, -1.6091359853744507), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(4.1865, device='cuda:0', grad_fn=<NllLossBackward0>)
[(341156, -4.904179096221924), (491519, -1.5761566162109375), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(4.0875, device='cuda:0', grad_fn=<NllLossBackward0>)
[(341156, -3.2019271850585938), (491519, -1.594560146331787), (499999, 0.0)]
([0, 0, 2], [3, 0, 2])
tensor(4.0182,

 99%|█████████▉| 719/725 [23:42:25<10:32, 105.45s/it]

test：0.5, test mean: 0.24703557312252963
([1, 3, 7], [1, 3, 7])
tensor(7.0282, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.21019497513771057)]
([1, 3, 7], [1, 3, 7])
tensor(7.0053, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -9.09211540222168)]
([1, 3, 7], [1, 3, 7])
tensor(5.4529, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.025292873382568)]
([1, 3, 0], [1, 3, 7])
tensor(4.1292, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.049285888671875)]
([1, 3, 0], [1, 3, 7])
tensor(4.0519, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.357429504394531)]
([1, 3, 0], [1, 3, 7])
tensor(4.0344, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -4.490245342254639)]
([1, 3, 0], [1, 3, 7])
tensor(4.0293, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.

 99%|█████████▉| 720/725 [23:47:35<13:54, 166.99s/it]

test：0.0, test mean: 0.24606299212598426
([8, 2, 5], [8, 2, 1])
tensor(5.1903, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.00010173760529141873)]


 99%|█████████▉| 721/725 [23:47:59<08:16, 124.10s/it]

([7, 7, 2], [7, 7, 2])
tensor(9.4446, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.21019040048122406), (380927, -0.3589375913143158), (499999, 0.0)]
([7, 7, 2], [7, 7, 2])
tensor(9.8432, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -9.091961860656738), (380927, -1.3430509567260742), (499999, 0.0)]
([7, 7, 2], [7, 7, 2])
tensor(6.3972, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.901113986968994), (380927, 6.136460304260254), (499999, 0.0)]
([0, 7, 2], [7, 7, 2])
tensor(3.2373, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.817356109619141), (380927, 5.36474609375), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(3.0315, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.037919521331787), (380927, 6.442840576171875), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(2.9903, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.179906845092773), (380927, 7.579498767852783), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(2.9729, device='cuda:

100%|█████████▉| 722/725 [23:52:16<08:11, 163.98s/it]

test：0.0, test mean: 0.24509803921568626
([7, 2, 8], [7, 2, 8])
tensor(6.6600, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


100%|█████████▉| 723/725 [23:52:41<04:04, 122.28s/it]

([1, 7, 0], [1, 7, 1])
tensor(5.6549, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.21022096276283264), (331775, -0.06582143902778625)]
([1, 7, 1], [1, 7, 1])
tensor(5.8740, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -9.797591209411621), (331775, 13.135656356811523)]
([1, 7, 0], [1, 7, 1])
tensor(3.7770, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.0870680809021), (331775, 14.285801887512207)]
([1, 7, 1], [1, 7, 1])
tensor(2.5144, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.23506850004196167), (331775, 25.37211036682129)]
([1, 0, 0], [1, 7, 1])
tensor(1.7693, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.22106504440307617), (331775, 26.410747528076172)]
([1, 0, 0], [1, 7, 1])
tensor(1.6995, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.4983872175216675), (331775, 26.808582305908203)]
([1, 0, 0], [1, 7, 1])
tensor(1.6770, devi

100%|█████████▉| 724/725 [23:57:21<02:49, 169.46s/it]

test：0.0, test mean: 0.24414062500000003
([0, 2], [0, 2])
tensor(6.0584, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0)]


100%|██████████| 725/725 [23:57:41<00:00, 118.98s/it]


In [15]:
# if __name__=='__main__':
#     acc = []
#     preds = []
#     labels = []

#     for step, val_batch_data in enumerate(validloader):
#         print(step,len(validloader))
#         cur_batch_size = val_batch_data[0].size(0)

#         exe_input = val_batch_data[0].to(device) if use_gpu else val_batch_data[0]

#         label = val_batch_data[1].to(device) if use_gpu else val_batch_data[1]
#         fake_label = torch.zeros_like(label.squeeze())
#         # fake_label[fake_label == 9] = 0
#         # temp = torch.zeros((6,9))
#         # for i,j in enumerate(fake_label): temp[i][j] = 1
#         # fake_label = temp.cuda()
#         # fake_label = fake_label.reshape(-1,1)
#         label = label.squeeze() - 1
        
#         pred,temp = malconv(exe_input,ce_loss,fake_label,label)
        
#         pred = np.argmax(pred,1)
#         label = label.cpu().data.numpy().astype(int)
#         # temp_grad,temp = malconv(exe_input)
#         preds.extend(pred.tolist())
#         labels.extend(label.tolist())

#         val_Macc = (label == pred).sum()
#         val_Macc = val_Macc / cur_batch_size
#         acc.append(val_Macc)
#         print(f"test：{val_Macc}, test mean: {np.mean(acc)}")

In [16]:
temp.grad[0,123:].sum(-1).sum(-1)

tensor(3.2881, device='cuda:0')