In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import multiprocessing as mp
from datetime import datetime
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.optim import Adam
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
# from src.model import *
# from src.util import *
os.environ["CUDA_VISIBLE_DEVICES"] = '3'

In [2]:
trained_model_path = '/workdir/security/home/junjiehuang2468/paper/trained_models_weight/kaggle_miscrosoft/'
best_trained_model = '2022-01-18 19:10/50w_epoch:15_test_acc:0.961822.pt'
data_path = "/workdir/security/home/junjiehuang2468/paper/data/kaggle/"
train_data_path = data_path + "malwares/"  # Training data
train_label_path = data_path + "train_labels.csv"  # Training label

In [3]:
result_path = '/workdir/security/home/junjiehuang2468/paper/results/kaggle_miscrosoft/'

In [4]:
CUDA = True if torch.cuda.is_available() else False
NUM_WORKERS = 16  # Number of cores to use for data loader
BATCH_SIZE = 3  #
LEAVE_BIT_NUMBER = 500000
KERNEL_SIZE = 500  # Kernel size & stride for Malconv (defualt : 500)

In [5]:
trainset = pd.read_csv(data_path + 'train_dataset.csv')
validset = pd.read_csv(data_path + 'valid_dataset.csv')

In [6]:
class ExeDataset(Dataset):
    def __init__(self, malware_names, data_path, labels, leave_bit_num):
        self.malware_names = malware_names
        self.data_path = data_path
        self.labels = labels
        self.leave_bit_num = leave_bit_num

    def __len__(self):
        return len(self.malware_names)

    def __getitem__(self, idx):
        with open(self.data_path + self.malware_names[idx] + '.txt','rb') as fp:
            data = [bit+1 for bit in fp.read()[:self.leave_bit_num]]
            padding = [0]*(self.leave_bit_num-len(data))
            data = data + padding

        return np.array(data), np.array([self.labels[idx]])

In [7]:
train_dataset = ExeDataset(
    trainset["id"].tolist(), 
    train_data_path, 
    trainset["labels"].tolist(), 
    LEAVE_BIT_NUMBER
)
valid_dataset = ExeDataset(
    validset["id"].tolist(), 
    train_data_path, 
    validset["labels"].tolist(), 
    LEAVE_BIT_NUMBER
)

In [8]:
trainloader = DataLoader(
    dataset = train_dataset,
    batch_size = BATCH_SIZE,
    shuffle = True,
    num_workers = NUM_WORKERS,
    pin_memory = True
)
validloader = DataLoader(
    dataset = valid_dataset,
    batch_size = BATCH_SIZE,
    shuffle = True,
    num_workers = NUM_WORKERS,
    pin_memory = True
)

In [9]:
def mp_func(i,inpu,te,gr):
    check = 0
    grad_cum = 0
    for j,(inp,g,t) in enumerate(zip(inpu,gr,te)):
        if inp != 0: 
            check = j
            continue
        max_idx = np.argmin(g).tolist()
        org_max_idx = np.argmax(t).tolist()
        if g[max_idx] > 0: continue
        grad_cum += g[max_idx]
        te[j][org_max_idx] = 0
        te[j][max_idx] = 1
    return [i,te,check,grad_cum]

In [10]:
# class Model(nn.Module):
#     def __init__(self, data_length = 2e6, kernel_size = 500):
#         super().__init__()
#         self.embedding = nn.Embedding(257, 8, padding_idx=0)
#         self.conv_layer_1 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
#         # self.bn_1 = nn.BatchNorm1d(128)
#         self.conv_layer_2 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
#         self.pool_layer_2 = nn.MaxPool1d(data_length//kernel_size)
#         self.fc_layer_3 = nn.Linear(128, 128)
#         self.fc_layer_4 = nn.Linear(128, 9)
        
#     def forward(self,x):
#         x = self.embedding(x)
#         x = x.transpose(-1,-2)
#         x_conv_1 = self.conv_layer_1(x[:,:4,:])
#         x_conv_2 = torch.sigmoid(self.conv_layer_2(x[:,4:,:]))
#         x = x_conv_1*x_conv_2
#         del x_conv_1,x_conv_2
#         x = self.pool_layer_2(x).squeeze()
#         x = self.fc_layer_3(x)
#         x = self.fc_layer_4(x)
#         # x = torch.sigmoid(x)
#         return x

In [11]:
class Model(nn.Module):
    def __init__(self, data_length = 2e6, kernel_size = 500):
        super().__init__()
        self.embedding = nn.Embedding(257, 8, padding_idx=0)
        self.conv_layer_1 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
        # self.bn_1 = nn.BatchNorm1d(128)
        self.conv_layer_2 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
        self.pool_layer_2 = nn.MaxPool1d(data_length//kernel_size)
        self.fc_layer_3 = nn.Linear(128, 128)
        self.fc_layer_4 = nn.Linear(128, 9)
    
    def forward(self, input_, loss_fn, fake_label, label):
        temp = F.one_hot(input_,num_classes=257).float()
        temp.requires_grad = True
        temp.retain_grad()
        for _ in range(10):
            x = temp @ self.embedding.weight
            x = x.transpose(-1,-2)
            x_conv_1 = self.conv_layer_1(x[:,:4,:])
            x_conv_2 = torch.sigmoid(self.conv_layer_2(x[:,4:,:]))
            x = x_conv_1*x_conv_2
            del x_conv_1,x_conv_2
            x = self.pool_layer_2(x).squeeze()
            x = self.fc_layer_3(x)
            x = self.fc_layer_4(x)
            # fake_label = torch.zeros_like(x)
            print((torch.argmax(torch.softmax(x,dim=-1),dim=-1).tolist() , label.tolist()))
            loss = loss_fn(x,fake_label).cuda()
            print(loss)
            loss.backward()
            data = [(i,inpu,te,gr) for i,(inpu,te,gr) in enumerate(zip(
                input_.detach().cpu().numpy(),
                temp.detach().cpu().numpy(),
                temp.grad.detach().cpu().numpy()
            ))]
            with mp.Pool(processes=24 if len(data) > 24 else len(data)) as pool:
                results = pool.starmap(mp_func,data)
            
            check = [r[2] for r in results]
            results = sorted(results,key = lambda x: x[0])
            print([(r[2],temp.grad[i,r[2]:,:].sum(-1).sum(-1).tolist()) for i,r in enumerate(results)])
            if sum(check)/len(check) == LEAVE_BIT_NUMBER-1: break
            for i in range(len(temp)):
                temp.data[i] = torch.tensor(results[i][1], dtype=torch.float, requires_grad=True).cuda()
                
        return x.cpu().detach().numpy(),temp,check

In [12]:
# class Model(nn.Module):
#     def __init__(self, data_length = 2e6, kernel_size = 500):
#         super().__init__()
#         self.embedding = nn.Embedding(257, 8, padding_idx=0)
#         self.conv_layer_1 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
#         # self.bn_1 = nn.BatchNorm1d(128)
#         self.conv_layer_2 = nn.Conv1d(4, 128, kernel_size, stride = kernel_size, bias = True)
#         self.pool_layer_2 = nn.MaxPool1d(data_length//kernel_size)
#         self.fc_layer_3 = nn.Linear(128, 128)
#         self.fc_layer_4 = nn.Linear(128, 9)
    
#     def forward(self, input_, loss_fn):
#         temp = F.one_hot(input_,num_classes=257).float()
#         temp.requires_grad = True
#         temp.retain_grad()
#         for _ in range(6):
#             x = temp @ self.embedding.weight
#             x = torch.transpose(x, -1, -2)

#             x_conv_1 = self.conv_layer_1(x[:,:4,:])
#             x_conv_2 = torch.sigmoid(self.conv_layer_2(x[:,4:,:]))

#             x = x_conv_1*x_conv_2
#             del x_conv_1,x_conv_2
#             x = self.pool_layer_2(x).squeeze()
            
#             x = self.fc_layer_3(x)
#             x = self.fc_layer_4(x)
            
#             print(torch.argmax(torch.softmax(x,dim=-1),dim=-1).float().mean())
            
#             mislead_labels = torch.zeros_like(x).cuda()
#             loss = loss_fn(x,mislead_labels)
#             loss.backward()
            
#             data = [(i,inpu,te,gr) for i,(inpu,te,gr) in enumerate(zip(
#                 input_.detach().cpu().numpy(),
#                 temp.detach().cpu().numpy(),
#                 temp.grad.detach().cpu().numpy()
#             ))]
#             with mp.Pool(processes=24 if len(data) > 24 else len(data)) as pool:
#                 results = pool.starmap(mp_func,data)
#             results = sorted(results,key = lambda x: x[0])
#             for i in range(len(temp)):
#                 temp.data[i] = torch.tensor(results[i][1], dtype=torch.float, requires_grad=True).cuda()
        
#         return x.detach().cpu().numpy(),temp

In [13]:
model = Model(data_length=LEAVE_BIT_NUMBER,kernel_size=KERNEL_SIZE)

ce_loss = nn.CrossEntropyLoss()
optim = Adam(model.parameters())

model = model.cuda() if CUDA else model
ce_loss = ce_loss.cuda() if CUDA else ce_less

In [14]:
model.load_state_dict(torch.load(trained_model_path + best_trained_model))

<All keys matched successfully>

In [15]:
if __name__=='__main__':
    model.eval()
    acc = []
    preds = []
    labels = []
    for step, (batch_data,batch_label) in enumerate(tqdm(validloader)):
        batch_data = batch_data.cuda() if CUDA else batch_data
        batch_label = batch_label.cuda() if CUDA else batch_label
        batch_label = batch_label.squeeze() - 1
        fake_label = torch.zeros_like(batch_label)
        pred,temp,check = model(batch_data,ce_loss,fake_label,batch_label)
        if sum(check)/len(check) == LEAVE_BIT_NUMBER-1: continue
            
        pred = np.argmax(pred,1)
        batch_label = batch_label.cpu().data.numpy()

        preds.extend(pred.tolist())
        labels.extend(batch_label.tolist())

        count = total = 0
        for c,ans in zip(check,batch_label == pred):
            if c != LEAVE_BIT_NUMBER-1:
                count += ans
                total += 1
        acc.append(count/total)
        print(f"test：{count/total}, test mean: {np.mean(acc)}")

  0%|          | 0/725 [00:00<?, ?it/s]

([6, 0, 2], [6, 7, 2])
tensor(7.9986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  0%|          | 1/725 [00:31<6:25:11, 31.92s/it]

([8, 2, 8], [8, 2, 8])
tensor(17.3843, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  0%|          | 2/725 [00:57<5:38:11, 28.07s/it]

([1, 1, 6], [1, 1, 6])
tensor(13.4393, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  0%|          | 3/725 [01:21<5:16:23, 26.29s/it]

([3, 1, 5], [3, 8, 5])
tensor(10.3210, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -17.97531509399414), (499999, 0.0), (499999, 0.0)]
([3, 1, 5], [3, 8, 5])
tensor(9.6533, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -34.26577377319336), (499999, 0.0), (499999, 0.0)]
([3, 1, 5], [3, 8, 5])
tensor(8.2758, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -46.634769439697266), (499999, 0.0), (499999, 0.0)]
([3, 1, 5], [3, 8, 5])
tensor(7.2796, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -54.992820739746094), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [3, 8, 5])
tensor(6.9140, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -55.990806579589844), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [3, 8, 5])
tensor(6.8081, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -51.619022369384766), (499999, 0.0), (499999, 0.0)]
([0, 1, 5], [3, 8, 5])
tensor(6.7081, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -52.69575881958008), (499999, 0.0), (4

  1%|          | 4/725 [05:43<23:52:35, 119.22s/it]

test：0.0, test mean: 0.0
([1, 6, 2], [1, 6, 2])
tensor(18.3228, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  1%|          | 5/725 [06:10<17:11:26, 85.95s/it] 

([7, 0, 2], [7, 0, 2])
tensor(14.9195, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 3.1132798194885254), (442367, -1.7518399545224383e-05), (499999, 0.0)]
([7, 0, 2], [7, 0, 2])
tensor(13.2304, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.522191047668457), (442367, -3.515425487421453e-05), (499999, 0.0)]
([7, 0, 2], [7, 0, 2])
tensor(11.3917, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.562060356140137), (442367, -5.278167736832984e-05), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(8.8859, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.629251480102539), (442367, -4.7499641368631274e-05), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(8.8859, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.696442604064941), (442367, -5.948098987573758e-05), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(8.8859, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -5.763635635375977), (442367, -8.5242441855371e-05), (499999, 0.0)]
([0, 0, 2], [7, 0, 2

  1%|          | 6/725 [10:43<29:51:54, 149.53s/it]

test：0.5, test mean: 0.25
([8, 2, 0], [8, 2, 0])
tensor(20.6517, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.03132149577140808), (499999, 0.0)]


  1%|          | 7/725 [11:07<21:40:27, 108.67s/it]

([5, 2, 2], [5, 2, 2])
tensor(20.4112, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  1%|          | 8/725 [11:33<16:22:23, 82.21s/it] 

([2, 2, 7], [2, 2, 7])
tensor(27.2406, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.9423112869262695)]
([2, 2, 7], [2, 2, 7])
tensor(25.5556, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 12.906265258789062)]
([2, 2, 7], [2, 2, 7])
tensor(23.4994, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 9.478619575500488)]
([2, 2, 0], [2, 2, 7])
tensor(19.1563, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 14.23872184753418)]
([2, 2, 0], [2, 2, 7])
tensor(19.1322, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 14.116445541381836)]
([2, 2, 0], [2, 2, 7])
tensor(19.1322, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 13.994169235229492)]
([2, 2, 0], [2, 2, 7])
tensor(19.1318, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 13.89537

  1%|          | 9/725 [16:34<29:58:20, 150.70s/it]

test：0.0, test mean: 0.16666666666666666
([3, 8, 2], [5, 8, 2])
tensor(12.4641, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  1%|▏         | 10/725 [16:59<22:11:47, 111.76s/it]

([0, 7, 5], [0, 7, 5])
tensor(11.8688, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.7659730911254883), (499999, 0.0)]
([0, 7, 5], [0, 7, 5])
tensor(8.8631, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 17.73682975769043), (499999, 0.0)]
([0, 7, 5], [0, 7, 5])
tensor(6.5873, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 115.19287109375), (499999, 0.0)]
([0, 0, 5], [0, 7, 5])
tensor(3.5256, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 115.38324737548828), (499999, 0.0)]
([0, 0, 5], [0, 7, 5])
tensor(3.5230, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 115.52745819091797), (499999, 0.0)]
([0, 0, 5], [0, 7, 5])
tensor(3.5211, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 115.56643676757812), (499999, 0.0)]
([0, 0, 5], [0, 7, 5])
tensor(3.5200, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 115.55059814453125), (499999, 0

  2%|▏         | 11/725 [21:35<32:08:05, 162.02s/it]

test：0.0, test mean: 0.125
([0, 8, 2], [0, 8, 2])
tensor(13.1549, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 12/725 [22:02<23:58:14, 121.03s/it]

([1, 3, 4], [1, 3, 4])
tensor(13.0024, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 13/725 [22:27<18:13:13, 92.13s/it] 

([2, 1, 2], [2, 1, 2])
tensor(24.9575, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.005405490752309561), (499999, 0.0)]


  2%|▏         | 14/725 [22:55<14:20:21, 72.60s/it]

([1, 7, 7], [1, 7, 7])
tensor(16.8562, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 2.942309617996216), (499999, 0.0)]
([1, 7, 7], [1, 7, 7])
tensor(14.5656, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 14.431379318237305), (499999, 0.0)]
([1, 7, 7], [1, 7, 7])
tensor(12.0572, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -8.865626335144043), (499999, 0.0)]
([1, 0, 7], [1, 7, 7])
tensor(10.2661, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -8.863463401794434), (499999, 0.0)]
([1, 0, 7], [1, 7, 7])
tensor(10.2661, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -8.861299514770508), (499999, 0.0)]
([1, 0, 7], [1, 7, 7])
tensor(10.2661, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -8.859139442443848), (499999, 0.0)]
([1, 0, 7], [1, 7, 7])
tensor(10.2661, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -8.856977462768555), (

  2%|▏         | 15/725 [27:33<26:33:15, 134.64s/it]

test：0.0, test mean: 0.1
([5, 7, 5], [5, 7, 5])
tensor(13.9088, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.029038066044449806), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 16/725 [27:59<20:03:07, 101.82s/it]

([0, 2, 7], [0, 2, 7])
tensor(12.5281, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.9181621074676514)]
([0, 2, 7], [0, 2, 7])
tensor(10.5577, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 20.601123809814453)]
([0, 2, 7], [0, 2, 7])
tensor(8.7613, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 13.675745010375977)]
([0, 2, 0], [0, 2, 7])
tensor(6.5926, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 13.669595718383789)]
([0, 2, 0], [0, 2, 7])
tensor(6.5926, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 13.663448333740234)]
([0, 2, 0], [0, 2, 7])
tensor(6.5926, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 13.657379150390625)]
([0, 2, 0], [0, 2, 7])
tensor(6.5926, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 13.65131378

  2%|▏         | 17/725 [32:50<31:12:58, 158.73s/it]

test：0.0, test mean: 0.08333333333333333
([2, 1, 2], [2, 1, 2])
tensor(25.4140, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 18/725 [33:18<23:26:00, 119.32s/it]

([7, 1, 8], [7, 1, 8])
tensor(16.8381, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  3%|▎         | 19/725 [33:44<17:55:27, 91.40s/it] 

([1, 0, 1], [1, 0, 1])
tensor(14.3975, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -6.786143057979643e-05), (499999, 0.0)]


  3%|▎         | 20/725 [34:10<14:02:36, 71.71s/it]

([3, 1, 5], [3, 1, 5])
tensor(11.0236, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  3%|▎         | 21/725 [34:36<11:21:55, 58.12s/it]

([2, 1, 7], [2, 1, 7])
tensor(24.6473, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.016709527000784874), (380927, 2.9181621074676514)]
([2, 1, 7], [2, 1, 7])
tensor(22.5841, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.03341905400156975), (380927, 21.84639549255371)]
([2, 1, 7], [2, 1, 7])
tensor(21.7590, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.05012857913970947), (380927, 20.22881317138672)]
([2, 1, 0], [2, 1, 7])
tensor(18.0183, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0668381080031395), (380927, 17.570005416870117)]
([2, 1, 0], [2, 1, 7])
tensor(17.9928, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.08354762196540833), (380927, 17.948219299316406)]
([2, 1, 0], [2, 1, 7])
tensor(17.9917, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.10025715827941895), (380927, 18.106000900268555)]
([2, 1, 0], [2, 1, 7])
tensor(

  3%|▎         | 22/725 [39:35<25:29:04, 130.50s/it]

test：0.0, test mean: 0.07142857142857142
([1, 7, 2], [1, 7, 2])
tensor(23.4845, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.014130987226963043), (380927, 2.9181621074676514), (499999, 0.0)]
([1, 7, 2], [1, 7, 2])
tensor(20.1681, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.028261974453926086), (380927, 8.932491302490234), (499999, 0.0)]
([1, 7, 2], [1, 7, 2])
tensor(18.8444, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.04239294305443764), (380927, 0.42215871810913086), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(14.6273, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.05652394890785217), (380927, 0.42178964614868164), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(14.6273, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.07065489143133163), (380927, 0.4220442771911621), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(14.6273, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.08478590846061707), (380927, 0.42229795455932617)

  3%|▎         | 23/725 [44:08<33:45:34, 173.13s/it]

test：0.0, test mean: 0.0625
([6, 1, 8], [6, 1, 8])
tensor(16.4683, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  3%|▎         | 24/725 [44:33<25:02:56, 128.64s/it]

([5, 8, 2], [5, 8, 2])
tensor(19.6386, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  3%|▎         | 25/725 [44:57<18:55:52, 97.36s/it] 

([7, 6, 1], [7, 6, 1])
tensor(15.8510, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9423112869262695), (499999, 0.0), (499999, 0.0)]
([7, 6, 1], [7, 6, 1])
tensor(14.1503, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 14.525495529174805), (499999, 0.0), (499999, 0.0)]
([7, 6, 1], [7, 6, 1])
tensor(12.2580, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.23819780349731445), (499999, 0.0), (499999, 0.0)]
([0, 6, 1], [7, 6, 1])
tensor(8.2277, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -23.621841430664062), (499999, 0.0), (499999, 0.0)]
([0, 6, 1], [7, 6, 1])
tensor(7.9994, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -24.053955078125), (499999, 0.0), (499999, 0.0)]
([0, 6, 1], [7, 6, 1])
tensor(7.9994, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -24.486072540283203), (499999, 0.0), (499999, 0.0)]
([0, 6, 1], [7, 6, 1])
tensor(7.9994, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -24.918184280395508), (499999, 0.0), (

  4%|▎         | 26/725 [49:25<28:51:11, 148.60s/it]

test：0.0, test mean: 0.05555555555555555
([2, 2, 8], [2, 2, 8])
tensor(22.1537, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  4%|▎         | 27/725 [49:50<21:36:41, 111.46s/it]

([2, 6, 3], [2, 6, 3])
tensor(18.3266, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  4%|▍         | 28/725 [50:15<16:33:43, 85.54s/it] 

([7, 5, 7], [7, 5, 7])
tensor(17.5800, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.888364791870117), (499999, 0.0), (380927, 2.7659730911254883)]
([7, 5, 7], [7, 5, 7])
tensor(14.3505, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 25.09968376159668), (499999, 0.0), (380927, 10.125085830688477)]
([7, 5, 7], [7, 5, 7])
tensor(6.8764, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 23.71428680419922), (499999, 0.0), (380927, 49.629539489746094)]
([0, 5, 0], [7, 5, 7])
tensor(3.2489, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 23.714448928833008), (499999, 0.0), (380927, 49.5538330078125)]
([0, 5, 0], [7, 5, 7])
tensor(3.2483, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 23.714609146118164), (499999, 0.0), (380927, 49.50535583496094)]
([0, 5, 0], [7, 5, 7])
tensor(3.2481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 23.714771270751953), (499999, 0.0), (380927, 49.4520378112793)]
([0, 5, 0], [7, 5, 7])
tensor(3.2481, device='cuda:0

  4%|▍         | 29/725 [55:00<28:04:26, 145.21s/it]

test：0.0, test mean: 0.05
([2, 8, 0], [2, 8, 0])
tensor(15.7810, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (405503, 0.0014023378025740385)]
([2, 8, 0], [2, 8, 0])
tensor(15.7810, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (405503, 0.012996383011341095)]
([2, 8, 0], [2, 8, 0])
tensor(15.7808, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (405503, 0.012641152366995811)]
([2, 8, 0], [2, 8, 0])
tensor(15.7808, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (405503, 0.012494166381657124)]
([2, 8, 0], [2, 8, 0])
tensor(15.7808, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (405503, 0.010439479723572731)]
([2, 8, 0], [2, 8, 0])
tensor(15.7808, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (405503, 0.009955784305930138)]
([2, 8, 0], [2, 8, 0])
tensor(15.7808, device='cuda:0', grad_fn=<NllLossBackward0>)
[(49999

  4%|▍         | 30/725 [59:50<36:27:26, 188.84s/it]

test：1.0, test mean: 0.13636363636363635
([0, 0, 6], [1, 0, 6])
tensor(3.7399, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -3.0888184454624934e-08), (499999, 0.0)]
([0, 0, 6], [1, 0, 6])
tensor(3.7399, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -6.177636890924987e-08), (499999, 0.0)]
([0, 0, 6], [1, 0, 6])
tensor(3.7399, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -9.266454981116112e-08), (499999, 0.0)]
([0, 0, 6], [1, 0, 6])
tensor(3.7399, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -1.2355273781849974e-07), (499999, 0.0)]
([0, 0, 6], [1, 0, 6])
tensor(3.7399, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -1.5444092582583835e-07), (499999, 0.0)]
([0, 0, 6], [1, 0, 6])
tensor(3.7399, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -1.8532909962232225e-07), (499999, 0.0)]
([0, 0, 6], [1, 0, 6])
tensor(3.7399, device='cuda:0', grad_fn=<Nll

  4%|▍         | 31/725 [1:04:30<41:39:21, 216.08s/it]

test：1.0, test mean: 0.20833333333333334
([1, 8, 2], [1, 8, 2])
tensor(18.1440, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  4%|▍         | 32/725 [1:04:56<30:36:10, 158.98s/it]

([3, 2, 7], [3, 2, 7])
tensor(23.9152, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 2.9423108100891113)]
([3, 2, 7], [3, 2, 7])
tensor(21.5124, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 23.331283569335938)]
([3, 2, 7], [3, 2, 7])
tensor(19.4147, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -1.583951473236084)]
([3, 2, 0], [3, 2, 7])
tensor(18.1172, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -1.5841243267059326)]
([3, 2, 0], [3, 2, 7])
tensor(18.1172, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -1.584296703338623)]
([3, 2, 0], [3, 2, 7])
tensor(18.1172, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -1.5844669342041016)]
([3, 2, 0], [3, 2, 7])
tensor(18.1172, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -1.5

  5%|▍         | 33/725 [1:09:40<37:48:02, 196.65s/it]

test：0.0, test mean: 0.19230769230769232
([2, 0, 5], [2, 0, 5])
tensor(14.4845, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  5%|▍         | 34/725 [1:10:06<27:53:04, 145.27s/it]

([7, 5, 1], [7, 5, 1])
tensor(12.0125, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.7659730911254883), (499999, 0.0), (499999, 0.0)]
([7, 5, 1], [7, 5, 1])
tensor(10.4421, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 22.4570369720459), (499999, 0.0), (499999, 0.0)]
([7, 5, 1], [7, 5, 1])
tensor(6.8056, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 133.491455078125), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 5, 1])
tensor(5.8345, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 133.4173126220703), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 5, 1])
tensor(5.8333, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 133.38912963867188), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 5, 1])
tensor(5.8317, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 133.36734008789062), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 5, 1])
tensor(5.8313, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 133.3425750732422), (499999, 0.0), (499999, 0.

  5%|▍         | 35/725 [1:14:15<33:48:24, 176.38s/it]

test：0.0, test mean: 0.17857142857142858
([3, 1, 0], [3, 1, 0])
tensor(11.3835, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  5%|▍         | 36/725 [1:14:39<25:02:59, 130.89s/it]

([7, 1, 5], [7, 1, 5])
tensor(10.3893, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.01073096040636301)]


  5%|▌         | 37/725 [1:15:04<18:55:22, 99.02s/it] 

([7, 0, 8], [7, 0, 8])
tensor(9.1609, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.7659714221954346), (466943, 8.397930173487111e-07), (499999, 0.0)]
([7, 0, 8], [7, 0, 8])
tensor(6.7671, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 20.05219268798828), (466943, -7.20786061947365e-08), (499999, 0.0)]
([7, 0, 8], [7, 0, 8])
tensor(2.9775, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 48.10798645019531), (466943, -1.0297908374923281e-05), (499999, 0.0)]
([0, 0, 8], [7, 0, 8])
tensor(2.6214, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 48.1080322265625), (466943, -1.2582126146298833e-05), (499999, 0.0)]
([0, 0, 8], [7, 0, 8])
tensor(2.6214, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 48.10807800292969), (466943, 0.00012411152420099825), (499999, 0.0)]
([0, 0, 8], [7, 0, 8])
tensor(2.6214, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 48.108123779296875), (466943, 0.00012411152420099825), (499999, 0.0)]
([0, 0, 8], [7, 0, 8])
tens

  5%|▌         | 38/725 [1:19:19<27:49:38, 145.82s/it]

test：0.5, test mean: 0.2
([5, 2, 8], [5, 2, 8])
tensor(15.6764, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0011975764064118266), (499999, 0.0), (499999, 0.0)]


  5%|▌         | 39/725 [1:19:43<20:49:34, 109.29s/it]

([2, 0, 2], [2, 0, 2])
tensor(15.1591, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (405503, -0.00017270246462430805), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(15.1591, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (405503, -0.00025927548995241523), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(15.1593, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (405503, -0.01915084943175316), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(15.1593, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (405503, -0.02600257471203804), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(15.1591, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (405503, -0.025992488488554955), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(15.1591, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (405503, -0.026073619723320007), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(15.1591, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (405503, -

  6%|▌         | 40/725 [1:24:03<29:23:18, 154.45s/it]

test：1.0, test mean: 0.25
([1, 7, 2], [1, 1, 2])
tensor(18.7108, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.009891888126730919)]


  6%|▌         | 41/725 [1:24:27<21:53:41, 115.24s/it]

([2, 1, 1], [2, 1, 1])
tensor(20.2194, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  6%|▌         | 42/725 [1:24:51<16:40:56, 87.93s/it] 

([2, 5, 8], [2, 0, 8])
tensor(17.5346, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  6%|▌         | 43/725 [1:25:15<13:03:10, 68.90s/it]

([7, 0, 1], [7, 0, 1])
tensor(11.5645, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.043602026998996735), (499999, 0.0)]
([7, 0, 1], [7, 0, 1])
tensor(11.5585, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.03476863354444504), (499999, 0.0)]
([7, 0, 1], [7, 0, 1])
tensor(11.5584, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.03045511059463024), (499999, 0.0)]
([7, 0, 1], [7, 0, 1])
tensor(11.5583, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.025439323857426643), (499999, 0.0)]
([7, 0, 1], [7, 0, 1])
tensor(11.5583, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.021852951496839523), (499999, 0.0)]
([7, 0, 1], [7, 0, 1])
tensor(11.5583, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.01912509649991989), (499999, 0.0)]
([7, 0, 1], [7, 0, 1])
tensor(11.5583, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -0.016

  6%|▌         | 44/725 [1:29:22<23:06:40, 122.17s/it]

test：1.0, test mean: 0.29411764705882354
([7, 1, 2], [7, 1, 2])
tensor(23.3567, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  6%|▌         | 45/725 [1:29:46<17:30:02, 92.65s/it] 

([8, 0, 2], [8, 0, 2])
tensor(13.6146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -1.231741180163226e-07), (499999, 0.0)]
([8, 0, 2], [8, 0, 2])
tensor(13.6146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 6.708463047289115e-07), (499999, 0.0)]
([8, 0, 2], [8, 0, 2])
tensor(13.6146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -1.7991475260714651e-06), (499999, 0.0)]
([8, 0, 2], [8, 0, 2])
tensor(13.6146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -1.7990579408433405e-06), (499999, 0.0)]
([8, 0, 2], [8, 0, 2])
tensor(13.6146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -1.7249617485504132e-06), (499999, 0.0)]
([8, 0, 2], [8, 0, 2])
tensor(13.6146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -1.7247550658794353e-06), (499999, 0.0)]
([8, 0, 2], [8, 0, 2])
tensor(13.6146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (46

  6%|▋         | 46/725 [1:34:02<26:42:45, 141.63s/it]

test：1.0, test mean: 0.3333333333333333
([0, 7, 1], [0, 1, 1])
tensor(10.0238, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  6%|▋         | 47/725 [1:34:26<20:02:18, 106.40s/it]

([5, 0, 5], [1, 0, 5])
tensor(5.6150, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -0.00036231096601113677), (499999, 0.0)]
([5, 0, 5], [1, 0, 5])
tensor(5.6152, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.06191345676779747), (499999, 0.0)]
([5, 0, 5], [1, 0, 5])
tensor(5.6148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.061913155019283295), (499999, 0.0)]
([5, 0, 5], [1, 0, 5])
tensor(5.6148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.06192617118358612), (499999, 0.0)]
([5, 0, 5], [1, 0, 5])
tensor(5.6148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.06192602589726448), (499999, 0.0)]
([5, 0, 5], [1, 0, 5])
tensor(5.6148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.06192588433623314), (499999, 0.0)]
([5, 0, 5], [1, 0, 5])
tensor(5.6148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.061925742775201

  7%|▋         | 48/725 [1:38:37<28:11:24, 149.90s/it]

test：1.0, test mean: 0.3684210526315789
([2, 7, 0], [2, 7, 0])
tensor(20.0539, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9423112869262695), (499999, 0.0)]
([2, 7, 0], [2, 7, 0])
tensor(18.2674, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 8.378218650817871), (499999, 0.0)]
([2, 7, 0], [2, 7, 0])
tensor(16.1517, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.9585695266723633), (499999, 0.0)]
([2, 0, 0], [2, 7, 0])
tensor(11.8597, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.986594200134277), (499999, 0.0)]
([2, 0, 0], [2, 7, 0])
tensor(11.7843, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.989049434661865), (499999, 0.0)]
([2, 0, 0], [2, 7, 0])
tensor(11.7843, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.95027494430542), (499999, 0.0)]
([2, 0, 0], [2, 7, 0])
tensor(11.7840, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0

  7%|▋         | 49/725 [1:42:39<33:19:09, 177.44s/it]

test：0.0, test mean: 0.35
([0, 7, 3], [0, 7, 3])
tensor(6.8732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 6.188610768731451e-06), (499999, 0.0), (499999, 0.0)]
([0, 7, 3], [0, 7, 3])
tensor(6.8732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -4.24510108132381e-06), (499999, 0.0), (499999, 0.0)]
([0, 7, 3], [0, 7, 3])
tensor(6.8732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -4.245126092428109e-06), (499999, 0.0), (499999, 0.0)]
([0, 7, 3], [0, 7, 3])
tensor(6.8732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -4.245149284543004e-06), (499999, 0.0), (499999, 0.0)]
([0, 7, 3], [0, 7, 3])
tensor(6.8732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -4.245173386152601e-06), (499999, 0.0), (499999, 0.0)]
([0, 7, 3], [0, 7, 3])
tensor(6.8732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -4.245197942509549e-06), (499999, 0.0), (499999, 0.0)]
([0, 7, 3], [0, 7, 3])
tensor(6.8732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442

  7%|▋         | 50/725 [1:46:40<36:52:10, 196.64s/it]

test：1.0, test mean: 0.38095238095238093
([2, 0, 2], [2, 0, 2])
tensor(19.2462, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 51/725 [1:47:03<27:03:20, 144.51s/it]

([2, 0, 5], [2, 0, 5])
tensor(15.7328, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 52/725 [1:47:25<20:09:10, 107.80s/it]

([0, 1, 7], [0, 1, 7])
tensor(15.8522, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.9181621074676514)]
([0, 1, 7], [0, 1, 7])
tensor(12.3519, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 9.089452743530273)]
([0, 1, 7], [0, 1, 7])
tensor(10.8815, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 5.499117851257324)]
([0, 1, 0], [0, 1, 7])
tensor(6.7448, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 5.495545387268066)]
([0, 1, 0], [0, 1, 7])
tensor(6.7448, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 5.491968154907227)]
([0, 1, 0], [0, 1, 7])
tensor(6.7448, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 5.488395690917969)]
([0, 1, 0], [0, 1, 7])
tensor(6.7448, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 5.4848194122314

  7%|▋         | 53/725 [1:51:39<28:17:01, 151.52s/it]

test：0.0, test mean: 0.36363636363636365
([1, 8, 6], [1, 8, 6])
tensor(18.2727, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 54/725 [1:52:03<21:07:39, 113.35s/it]

([1, 1, 1], [1, 1, 1])
tensor(18.5352, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, -8.224905014038086)]
([1, 1, 1], [1, 1, 1])
tensor(18.4347, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, -15.773492813110352)]
([1, 1, 1], [1, 1, 1])
tensor(17.7872, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, -50.0513916015625)]
([1, 1, 1], [1, 1, 1])
tensor(16.9982, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, -83.0165023803711)]
([1, 1, 1], [1, 1, 1])
tensor(16.7087, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, -118.76154327392578)]
([1, 1, 1], [1, 1, 1])
tensor(16.4193, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, -154.44801330566406)]
([1, 1, 1], [1, 1, 1])
tensor(15.7953, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, -189.

  8%|▊         | 55/725 [1:56:40<30:14:43, 162.51s/it]

test：0.0, test mean: 0.34782608695652173
([1, 2, 1], [1, 2, 1])
tensor(28.3025, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.021160071715712547), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 56/725 [1:57:03<22:25:17, 120.65s/it]

([2, 0, 1], [2, 0, 1])
tensor(19.2176, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 57/725 [1:57:26<16:54:51, 91.15s/it] 

([0, 2, 2], [0, 2, 2])
tensor(17.3253, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 58/725 [1:57:48<13:04:51, 70.60s/it]

([7, 1, 2], [7, 1, 2])
tensor(15.3250, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 59/725 [1:58:12<10:25:49, 56.38s/it]

([2, 0, 2], [2, 0, 2])
tensor(23.9003, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 60/725 [1:58:34<8:33:16, 46.31s/it] 

([6, 6, 1], [6, 6, 1])
tensor(14.4451, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 61/725 [1:58:57<7:14:24, 39.25s/it]

([0, 1, 0], [0, 1, 0])
tensor(7.9338, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▊         | 62/725 [1:59:20<6:21:04, 34.49s/it]

([5, 7, 7], [5, 7, 7])
tensor(19.7789, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (380927, 2.9423112869262695)]
([5, 7, 7], [5, 7, 7])
tensor(15.9133, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 21.694997787475586), (380927, 11.366283416748047)]
([5, 7, 7], [5, 7, 7])
tensor(12.4249, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 13.64938735961914), (380927, -6.562100410461426)]
([5, 0, 7], [5, 7, 7])
tensor(5.6590, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 13.547920227050781), (380927, -40.93817901611328)]
([5, 0, 0], [5, 7, 7])
tensor(5.2146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 13.466558456420898), (380927, -41.56340789794922)]
([5, 0, 0], [5, 7, 7])
tensor(5.2143, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 13.405599594116211), (380927, -42.161102294921875)]
([5, 0, 0], [5, 7, 7])
tensor(5.2141, devic

  9%|▊         | 63/725 [2:03:31<18:16:15, 99.36s/it]

test：0.0, test mean: 0.3333333333333333
([5, 1, 2], [5, 1, 2])
tensor(17.1424, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 64/725 [2:03:53<13:59:08, 76.17s/it]

([7, 5, 2], [7, 5, 2])
tensor(24.3123, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.7659730911254883), (499999, 0.0), (499999, 0.0)]
([7, 5, 2], [7, 5, 2])
tensor(22.6241, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 14.259552955627441), (499999, 0.0), (499999, 0.0)]
([7, 5, 2], [7, 5, 2])
tensor(19.7580, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 120.1441879272461), (499999, 0.0), (499999, 0.0)]
([7, 5, 2], [7, 5, 2])
tensor(16.9975, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 120.04300689697266), (499999, 0.0), (499999, 0.0)]
([0, 5, 2], [7, 5, 2])
tensor(16.5752, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 120.27217102050781), (499999, 0.0), (499999, 0.0)]
([0, 5, 2], [7, 5, 2])
tensor(16.5743, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 120.45404815673828), (499999, 0.0), (499999, 0.0)]
([0, 5, 2], [7, 5, 2])
tensor(16.5742, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 120.63200378417969), (499999, 0.0), (

  9%|▉         | 65/725 [2:08:27<24:49:19, 135.39s/it]

test：0.0, test mean: 0.32
([2, 1, 7], [2, 1, 7])
tensor(24.2686, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.9423112869262695)]
([2, 1, 7], [2, 1, 7])
tensor(22.4692, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 12.900809288024902)]
([2, 1, 7], [2, 1, 7])
tensor(20.6009, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 0.4924321174621582)]
([2, 1, 7], [2, 1, 7])
tensor(16.7650, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -32.71236038208008)]
([2, 1, 0], [2, 1, 7])
tensor(16.0953, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -32.643524169921875)]
([2, 1, 0], [2, 1, 7])
tensor(16.0943, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -32.68640899658203)]
([2, 1, 0], [2, 1, 7])
tensor(16.0942, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (49

  9%|▉         | 66/725 [2:13:15<33:11:14, 181.30s/it]

test：0.0, test mean: 0.3076923076923077
([6, 5, 0], [6, 5, 0])
tensor(3.5221, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 67/725 [2:13:39<24:28:40, 133.92s/it]

([1, 0, 1], [1, 0, 1])
tensor(8.2977, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 68/725 [2:14:02<18:24:20, 100.85s/it]

([2, 0, 7], [2, 0, 7])
tensor(18.3105, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|▉         | 69/725 [2:14:26<14:09:40, 77.71s/it] 

([2, 7, 1], [2, 7, 1])
tensor(22.9136, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.7659730911254883), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(21.0657, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 12.643152236938477), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(18.0282, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 120.45036315917969), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(15.8030, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 116.5662841796875), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(15.1162, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 117.27545166015625), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(15.1043, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 117.53965759277344), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(15.1012, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 117.68830871582031), (

 10%|▉         | 70/725 [2:18:42<23:51:01, 131.09s/it]

test：0.0, test mean: 0.2962962962962963
([8, 1, 7], [8, 1, 7])
tensor(15.4373, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -7.4367570877075195), (368639, 2.765972852706909)]
([8, 1, 7], [8, 1, 7])
tensor(13.5940, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -13.653286933898926), (368639, 44.117088317871094)]
([8, 1, 7], [8, 1, 7])
tensor(11.3014, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -19.869815826416016), (368639, 150.97015380859375)]
([8, 1, 0], [8, 1, 7])
tensor(9.4110, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -26.086347579956055), (368639, 150.981201171875)]
([8, 1, 0], [8, 1, 7])
tensor(9.4110, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -32.30287551879883), (368639, 150.99224853515625)]
([8, 1, 0], [8, 1, 7])
tensor(9.4110, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -38.519405364990234), (368639, 151.00332641601562)]
(

 10%|▉         | 71/725 [2:23:37<32:46:34, 180.42s/it]

test：0.5, test mean: 0.30357142857142855
([1, 2, 0], [1, 2, 0])
tensor(14.8579, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -7.435500144958496), (499999, 0.0), (499999, 0.0)]
([1, 2, 0], [1, 2, 0])
tensor(14.8597, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -13.966899871826172), (499999, 0.0), (499999, 0.0)]
([1, 2, 0], [1, 2, 0])
tensor(14.8597, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -20.49829864501953), (499999, 0.0), (499999, 0.0)]
([1, 2, 0], [1, 2, 0])
tensor(14.8597, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -27.029701232910156), (499999, 0.0), (499999, 0.0)]
([1, 2, 0], [1, 2, 0])
tensor(14.8597, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -33.561100006103516), (499999, 0.0), (499999, 0.0)]
([1, 2, 0], [1, 2, 0])
tensor(14.8597, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -40.092498779296875), (499999, 0.0), (499999, 0.0)]
([1, 2, 0], [1, 2, 0])
tensor(14.8597, device='cuda:0', grad_fn=<NllLossBackward0>)
[

 10%|▉         | 72/725 [2:28:10<37:44:14, 208.05s/it]

test：1.0, test mean: 0.3275862068965517
([0, 6, 8], [0, 6, 8])
tensor(7.1180, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|█         | 73/725 [2:28:33<27:37:39, 152.55s/it]

([1, 0, 7], [1, 0, 7])
tensor(16.8909, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.9181621074676514)]
([1, 0, 7], [1, 0, 7])
tensor(13.5063, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 12.905088424682617)]
([1, 0, 7], [1, 0, 7])
tensor(12.3218, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 4.9865031242370605)]
([1, 0, 0], [1, 0, 7])
tensor(7.9899, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 4.97460412979126)]
([1, 0, 0], [1, 0, 7])
tensor(7.9895, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 4.967017650604248)]
([1, 0, 0], [1, 0, 7])
tensor(7.9891, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 4.967531204223633)]
([1, 0, 0], [1, 0, 7])
tensor(7.9891, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 4.968043327331

 10%|█         | 74/725 [2:32:52<33:23:29, 184.65s/it]

test：0.0, test mean: 0.31666666666666665
([8, 2, 2], [8, 2, 2])
tensor(30.0881, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|█         | 75/725 [2:33:17<24:40:47, 136.69s/it]

([1, 2, 1], [1, 2, 1])
tensor(20.7840, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|█         | 76/725 [2:33:40<18:29:42, 102.59s/it]

([2, 2, 2], [2, 2, 2])
tensor(18.7907, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 11%|█         | 77/725 [2:34:03<14:10:19, 78.73s/it] 

([0, 1, 2], [0, 1, 2])
tensor(14.9122, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.00012129350216127932), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(14.9149, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.4038134217262268), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(14.9121, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.4038247764110565), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(14.9121, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.40382295846939087), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(14.9121, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.4037238359451294), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(14.9121, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.4036402702331543), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(14.9121, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.40356218814849854), (499999,

 11%|█         | 78/725 [2:38:21<23:46:45, 132.31s/it]

test：1.0, test mean: 0.3387096774193548
([8, 2, 2], [8, 2, 2])
tensor(19.7729, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -11.809257507324219), (499999, 0.0), (499999, 0.0)]
([8, 2, 2], [8, 2, 2])
tensor(20.0988, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -20.774173736572266), (499999, 0.0), (499999, 0.0)]
([8, 2, 2], [8, 2, 2])
tensor(20.0154, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -34.729347229003906), (499999, 0.0), (499999, 0.0)]
([8, 2, 2], [8, 2, 2])
tensor(19.3392, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -45.35823059082031), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [8, 2, 2])
tensor(19.0394, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -51.3652458190918), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [8, 2, 2])
tensor(18.9360, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -54.071434020996094), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [8, 2, 2])
tensor(18.9132, device='cuda:0', grad_fn=<NllLossBackward0>)
[(4

 11%|█         | 79/725 [2:42:57<31:29:41, 175.51s/it]

test：0.0, test mean: 0.328125
([2, 0, 1], [2, 0, 1])
tensor(15.5106, device='cuda:0', grad_fn=<NllLossBackward0>)


Process ForkPoolWorker-1119:
Process ForkPoolWorker-1118:
Process ForkPoolWorker-1120:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.6/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 334, in get
    with self._rlock:
 11%|█         | 79/725 [2:43:17<22:15:18, 124.02s/it]


KeyboardInterrupt: 

In [14]:
if __name__=='__main__':
    model.eval()
    acc = []
    preds = []
    labels = []
    for step, (batch_data,batch_label) in enumerate(tqdm(validloader)):
        batch_data = batch_data.cuda() if CUDA else batch_data
        batch_label = batch_label.cuda() if CUDA else batch_label
        batch_label = batch_label.squeeze() - 1
        fake_label = torch.zeros_like(batch_label)
        pred,temp,check = model(batch_data,ce_loss,fake_label,batch_label)
        if sum(check)/len(check) == LEAVE_BIT_NUMBER-1: continue
            
        pred = np.argmax(pred,1)
        batch_label = batch_label.cpu().data.numpy()

        preds.extend(pred.tolist())
        labels.extend(batch_label.tolist())

        count = total = 0
        for c,ans in zip(check,batch_label == pred):
            if c != LEAVE_BIT_NUMBER-1:
                count += ans
                total += 1
        acc.append(count/total)
        print(f"test：{count/total}, test mean: {np.mean(acc)}")

  0%|          | 0/725 [00:00<?, ?it/s]

([1, 0, 5], [1, 0, 5])
tensor(10.2925, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  0%|          | 1/725 [00:27<5:30:51, 27.42s/it]

([1, 0, 2], [1, 0, 2])
tensor(14.2205, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  0%|          | 2/725 [00:52<5:13:44, 26.04s/it]

([1, 2, 1], [1, 2, 3])
tensor(15.4146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.015333756804466248)]


  0%|          | 3/725 [01:17<5:05:25, 25.38s/it]

([2, 8, 8], [2, 8, 8])
tensor(25.0156, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  1%|          | 4/725 [01:40<4:57:43, 24.78s/it]

([2, 8, 1], [2, 8, 1])
tensor(21.3538, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  1%|          | 5/725 [02:08<5:07:16, 25.61s/it]

([0, 0, 8], [0, 0, 8])
tensor(2.6386, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 3.7432796489156317e-06), (499999, 0.0)]
([0, 0, 8], [0, 0, 8])
tensor(2.6386, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 5.650486855302006e-05), (499999, 0.0)]
([0, 0, 8], [0, 0, 8])
tensor(2.6386, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 4.7670313506387174e-05), (499999, 0.0)]
([0, 0, 8], [0, 0, 8])
tensor(2.6386, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 4.757713759317994e-05), (499999, 0.0)]
([0, 0, 8], [0, 0, 8])
tensor(2.6386, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 0.00011215817357879132), (499999, 0.0)]
([0, 0, 8], [0, 0, 8])
tensor(2.6386, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 0.00011217768769711256), (499999, 0.0)]
([0, 0, 8], [0, 0, 8])
tensor(2.6386, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 0.0001

  1%|          | 6/725 [06:35<21:31:01, 107.74s/it]

test：1.0, test mean: 1.0
([1, 4, 2], [1, 4, 2])
tensor(16.7283, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  1%|          | 7/725 [06:59<16:04:17, 80.58s/it] 

([0, 6, 8], [0, 6, 8])
tensor(6.9148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -1.559780526960708e-09), (499999, 0.0), (499999, 0.0)]
([0, 6, 8], [0, 6, 8])
tensor(6.9148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -2.0627382468774158e-08), (499999, 0.0), (499999, 0.0)]
([0, 6, 8], [0, 6, 8])
tensor(6.9148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -7.272181392181665e-06), (499999, 0.0), (499999, 0.0)]
([0, 6, 8], [0, 6, 8])
tensor(6.9148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -3.8079106161603704e-05), (499999, 0.0), (499999, 0.0)]
([0, 6, 8], [0, 6, 8])
tensor(6.9148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -3.792193820117973e-05), (499999, 0.0), (499999, 0.0)]
([0, 6, 8], [0, 6, 8])
tensor(6.9148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -3.794858275796287e-05), (499999, 0.0), (499999, 0.0)]
([0, 6, 8], [0, 6, 8])
tensor(6.9148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -3.85895946237724

  1%|          | 8/725 [11:28<27:57:07, 140.35s/it]

test：1.0, test mean: 1.0
([7, 0, 8], [7, 0, 8])
tensor(11.2672, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  1%|          | 9/725 [11:53<20:45:22, 104.36s/it]

([2, 1, 7], [2, 1, 7])
tensor(22.3697, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  1%|▏         | 10/725 [12:18<15:53:32, 80.02s/it]

([1, 3, 0], [1, 3, 0])
tensor(9.8903, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (339965, 27.47570037841797), (499999, 0.0)]
([1, 1, 0], [1, 3, 0])
tensor(7.1687, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (339965, 131.29977416992188), (499999, 0.0)]
([1, 1, 0], [1, 3, 0])
tensor(7.7763, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (339965, 162.49122619628906), (499999, 0.0)]
([1, 0, 0], [1, 3, 0])
tensor(6.6497, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (339965, 177.58477783203125), (499999, 0.0)]
([1, 1, 0], [1, 3, 0])
tensor(6.7523, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (339965, 202.1812744140625), (499999, 0.0)]
([1, 0, 0], [1, 3, 0])
tensor(6.5691, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (339965, 211.4308319091797), (499999, 0.0)]
([1, 0, 0], [1, 3, 0])
tensor(6.4727, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (339965, 212.82421875), (499999, 0.0)]
(

  2%|▏         | 11/725 [16:45<27:11:31, 137.10s/it]

test：0.0, test mean: 0.6666666666666666
([2, 2, 5], [2, 2, 5])
tensor(25.7814, device='cuda:0', grad_fn=<NllLossBackward0>)


  2%|▏         | 12/725 [17:10<20:24:11, 103.02s/it]

[(499999, -0.009891888126730919), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [0, 1, 2])
tensor(15.0142, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 13/725 [17:34<15:38:31, 79.09s/it] 

([2, 1, 3], [2, 1, 3])
tensor(24.0406, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 14/725 [18:00<12:25:43, 62.93s/it]

([1, 5, 7], [1, 5, 7])
tensor(12.0734, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 2.9367947578430176)]
([1, 5, 7], [1, 5, 7])
tensor(9.7040, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 31.439226150512695)]
([1, 5, 0], [1, 5, 7])
tensor(8.7754, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 30.998497009277344)]
([1, 5, 0], [1, 5, 7])
tensor(8.7726, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 30.787837982177734)]
([1, 5, 0], [1, 5, 7])
tensor(8.7722, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 30.609228134155273)]
([1, 5, 0], [1, 5, 7])
tensor(8.7719, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 30.450138092041016)]
([1, 5, 0], [1, 5, 7])
tensor(8.7717, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 30.312871932

  2%|▏         | 15/725 [22:50<25:56:29, 131.53s/it]

test：0.0, test mean: 0.5
([0, 1, 1], [0, 1, 1])
tensor(10.9327, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 16/725 [23:17<19:40:20, 99.89s/it] 

([1, 7, 1], [1, 7, 1])
tensor(20.6123, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.7659730911254883), (499999, 0.0)]
([1, 7, 1], [1, 7, 1])
tensor(17.3773, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 11.652776718139648), (499999, 0.0)]
([1, 7, 1], [1, 7, 1])
tensor(14.4860, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 37.23838806152344), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(12.0077, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 37.23750305175781), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(12.0077, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 37.23661422729492), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(12.0077, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 37.2357292175293), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(12.0077, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 37.234840393066406), (4999

  2%|▏         | 17/725 [27:45<29:37:24, 150.63s/it]

test：0.0, test mean: 0.4
([1, 2, 1], [1, 2, 1])
tensor(19.5171, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  2%|▏         | 18/725 [28:12<22:17:49, 113.54s/it]

([7, 1, 7], [7, 1, 7])
tensor(22.9119, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, -0.23285113275051117), (380927, 2.7659730911254883)]
([7, 1, 7], [7, 1, 7])
tensor(18.5046, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 25.125408172607422), (499999, -0.46570226550102234), (380927, 15.86917495727539)]
([7, 1, 7], [7, 1, 7])
tensor(12.6200, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.724098205566406), (499999, -0.6985534429550171), (380927, 96.8498306274414)]
([0, 1, 7], [7, 1, 7])
tensor(8.1084, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.432979583740234), (499999, -0.9314045310020447), (380927, 100.23075103759766)]
([0, 1, 0], [7, 1, 7])
tensor(7.8680, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.374862670898438), (499999, -1.1642557382583618), (380927, 100.24552917480469)]
([0, 1, 0], [7, 1, 7])
tensor(7.8678, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.32892417907715), (499999, -

  3%|▎         | 19/725 [32:48<31:50:10, 162.34s/it]

test：0.0, test mean: 0.3333333333333333
([1, 8, 2], [1, 8, 2])
tensor(21.1465, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  3%|▎         | 20/725 [33:13<23:43:07, 121.12s/it]

([2, 5, 8], [2, 5, 8])
tensor(15.4506, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  3%|▎         | 21/725 [33:40<18:06:46, 92.62s/it] 

([1, 5, 7], [1, 5, 7])
tensor(15.6168, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 2.7659692764282227)]
([1, 5, 7], [1, 5, 7])
tensor(13.2828, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 20.110248565673828)]
([1, 5, 0], [1, 5, 7])
tensor(9.7452, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 37.126922607421875)]
([1, 5, 0], [1, 5, 7])
tensor(9.5516, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 37.12690734863281)]
([1, 5, 0], [1, 5, 7])
tensor(9.5516, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 37.126888275146484)]
([1, 5, 0], [1, 5, 7])
tensor(9.5516, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 37.12687301635742)]
([1, 5, 0], [1, 5, 7])
tensor(9.5516, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 37.1268577575

  3%|▎         | 22/725 [38:39<30:11:30, 154.61s/it]

test：0.0, test mean: 0.2857142857142857
([5, 1, 3], [5, 1, 3])
tensor(15.1798, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (196607, -17.18808937072754)]
([5, 1, 3], [5, 1, 3])
tensor(14.9171, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (196607, -50.226783752441406)]
([5, 1, 3], [5, 1, 3])
tensor(14.1296, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (196607, -94.342041015625)]
([5, 1, 3], [5, 1, 3])
tensor(12.9660, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (196607, -115.23350524902344)]
([5, 1, 3], [5, 1, 3])
tensor(11.8122, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (196607, -126.21458435058594)]
([5, 1, 0], [5, 1, 3])
tensor(11.3594, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (196607, -129.7852325439453)]
([5, 1, 0], [5, 1, 3])
tensor(11.3308, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499

  3%|▎         | 23/725 [44:00<39:54:50, 204.69s/it]

test：0.0, test mean: 0.25
([1, 7, 0], [1, 7, 0])
tensor(15.1927, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.5570147037506104), (442367, 6.9545558289974e-07)]
([1, 7, 0], [1, 7, 0])
tensor(15.1967, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -7.699244976043701), (442367, 1.9474529835861176e-05)]
([1, 7, 0], [1, 7, 0])
tensor(8.9976, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -51.16759490966797), (442367, 1.9492572391754948e-05)]
([1, 0, 0], [1, 7, 0])
tensor(7.4080, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -53.89543151855469), (442367, 1.8239083146909252e-05)]
([1, 3, 0], [1, 7, 0])
tensor(10.4089, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 91.64692687988281), (442367, 1.8239916244056076e-05)]
([1, 0, 0], [1, 7, 0])
tensor(7.3650, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 91.53778839111328), (442367, 1.8240747522213496e-05

  3%|▎         | 24/725 [48:46<44:34:17, 228.90s/it]

test：0.5, test mean: 0.2777777777777778
([2, 7, 0], [2, 7, 0])
tensor(14.3957, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.918018341064453), (499999, 0.0)]
([2, 7, 0], [2, 7, 0])
tensor(11.2491, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 59.93922424316406), (499999, 0.0)]
([2, 3, 0], [2, 7, 0])
tensor(10.8148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 70.04353332519531), (499999, 0.0)]
([2, 0, 0], [2, 7, 0])
tensor(10.3188, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 69.91474914550781), (499999, 0.0)]
([2, 0, 0], [2, 7, 0])
tensor(10.3188, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 69.76886749267578), (499999, 0.0)]
([2, 0, 0], [2, 7, 0])
tensor(10.4373, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 65.14796447753906), (499999, 0.0)]
([2, 0, 0], [2, 7, 0])
tensor(10.3189, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.

  3%|▎         | 25/725 [53:20<47:08:57, 242.48s/it]

test：0.0, test mean: 0.25
([0, 1, 2], [0, 1, 2])
tensor(13.5882, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 4.489850158329034e-10), (499999, 0.0), (499999, 0.0)]


  4%|▎         | 26/725 [53:46<34:29:27, 177.64s/it]

([8, 2, 0], [8, 2, 0])
tensor(18.0881, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 9.258685622626217e-07)]
([8, 2, 0], [8, 2, 0])
tensor(18.0881, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 7.545586868218379e-06)]
([8, 2, 0], [8, 2, 0])
tensor(18.0881, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 7.547488166892435e-06)]
([8, 2, 0], [8, 2, 0])
tensor(18.0881, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 7.399921287287725e-06)]
([8, 2, 0], [8, 2, 0])
tensor(18.0881, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 7.659637958568055e-06)]
([8, 2, 0], [8, 2, 0])
tensor(18.0881, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 7.660575647605583e-06)]
([8, 2, 0], [8, 2, 0])
tensor(18.0881, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0

  4%|▎         | 27/725 [58:43<41:23:16, 213.46s/it]

test：1.0, test mean: 0.3181818181818182
([2, 2, 0], [2, 2, 0])
tensor(17.3447, device='cuda:0', grad_fn=<NllLossBackward0>)


  4%|▍         | 28/725 [59:09<30:26:48, 157.26s/it]

[(499999, 0.0), (499999, 0.0), (499999, 0.0)]
([2, 8, 2], [2, 8, 2])
tensor(24.1511, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  4%|▍         | 29/725 [59:36<22:48:05, 117.94s/it]

([2, 2, 2], [2, 2, 2])
tensor(26.5593, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  4%|▍         | 30/725 [1:00:02<17:29:12, 90.58s/it]

([3, 1, 6], [3, 1, 6])
tensor(12.4279, device='cuda:0', grad_fn=<NllLossBackward0>)
[(221183, -17.985015869140625), (499999, 0.0), (499999, 0.0)]
([3, 1, 6], [3, 1, 6])
tensor(11.7967, device='cuda:0', grad_fn=<NllLossBackward0>)
[(221183, -31.26449203491211), (499999, 0.0), (499999, 0.0)]
([3, 1, 6], [3, 1, 6])
tensor(10.4384, device='cuda:0', grad_fn=<NllLossBackward0>)
[(221183, -29.11562156677246), (499999, 0.0), (499999, 0.0)]
([0, 1, 6], [3, 1, 6])
tensor(8.5661, device='cuda:0', grad_fn=<NllLossBackward0>)
[(221183, -30.57427978515625), (499999, 0.0), (499999, 0.0)]
([0, 1, 6], [3, 1, 6])
tensor(8.4618, device='cuda:0', grad_fn=<NllLossBackward0>)
[(221183, -29.22955322265625), (499999, 0.0), (499999, 0.0)]
([0, 1, 6], [3, 1, 6])
tensor(8.4547, device='cuda:0', grad_fn=<NllLossBackward0>)
[(221183, -28.107131958007812), (499999, 0.0), (499999, 0.0)]
([0, 1, 6], [3, 1, 6])
tensor(8.4520, device='cuda:0', grad_fn=<NllLossBackward0>)
[(221183, -27.24020767211914), (499999, 0.0), (4

  4%|▍         | 31/725 [1:04:30<27:42:32, 143.74s/it]

test：0.0, test mean: 0.2916666666666667
([0, 2, 3], [0, 2, 7])
tensor(13.1556, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -4.0096426179303535e-10), (499999, 0.0), (499999, 0.0)]
([0, 2, 3], [0, 2, 7])
tensor(13.1556, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 0.0001153273624368012), (499999, 0.0), (499999, 0.0)]
([0, 2, 3], [0, 2, 7])
tensor(13.1556, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 0.00011532764619914815), (499999, 0.0), (499999, 0.0)]
([0, 2, 3], [0, 2, 7])
tensor(13.1556, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 0.0002138854906661436), (499999, 0.0), (499999, 0.0)]
([0, 2, 3], [0, 2, 7])
tensor(13.1556, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 0.0002138934505637735), (499999, 0.0), (499999, 0.0)]
([0, 2, 3], [0, 2, 7])
tensor(13.1556, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 0.00021390142501331866), (499999, 0.0), (499999, 0.0)]
([0, 2, 3], [0, 2, 7])
tensor(13.1556, device='cuda:0', grad_fn=<Nll

  4%|▍         | 32/725 [1:09:05<35:14:07, 183.04s/it]

test：1.0, test mean: 0.34615384615384615
([5, 2, 1], [5, 2, 1])
tensor(23.7427, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  5%|▍         | 33/725 [1:09:30<26:05:24, 135.73s/it]

([8, 2, 0], [8, 2, 0])
tensor(13.9327, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  5%|▍         | 34/725 [1:09:56<19:41:51, 102.62s/it]

([7, 8, 5], [7, 8, 5])
tensor(14.0240, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (499999, 0.0065400805324316025)]
([7, 8, 5], [7, 8, 5])
tensor(11.7681, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 16.018497467041016), (499999, 0.0), (499999, 0.013080161064863205)]
([7, 8, 5], [7, 8, 5])
tensor(10.6318, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 9.549964904785156), (499999, 0.0), (499999, 0.019620247185230255)]
([0, 8, 5], [7, 8, 5])
tensor(7.2408, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 9.970547676086426), (499999, 0.0), (499999, 0.02616032212972641)]
([0, 8, 5], [7, 8, 5])
tensor(7.2389, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 10.417254447937012), (499999, 0.0), (499999, 0.03270040452480316)]
([0, 8, 5], [7, 8, 5])
tensor(7.2357, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 10.615399360656738), (499999, 0.0), (499999, 0.03924047201871872)]
([0, 8, 5], [7, 8, 5])
tensor(7.235

  5%|▍         | 35/725 [1:14:18<28:50:39, 150.49s/it]

test：0.0, test mean: 0.32142857142857145
([1, 1, 0], [1, 1, 0])
tensor(15.3077, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  5%|▍         | 36/725 [1:14:43<21:38:14, 113.05s/it]

([1, 7, 1], [1, 7, 1])
tensor(14.0354, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.014130987226963043)]


  5%|▌         | 37/725 [1:15:08<16:33:21, 86.63s/it] 

([2, 8, 5], [2, 8, 5])
tensor(22.3445, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -2.9430365562438965)]
([2, 8, 5], [2, 8, 5])
tensor(22.1438, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -19.289783477783203)]
([2, 8, 5], [2, 8, 5])
tensor(19.5593, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -48.43898010253906)]
([2, 8, 1], [2, 8, 5])
tensor(17.8066, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -64.71871185302734)]
([2, 8, 5], [2, 8, 5])
tensor(19.8675, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -97.58139038085938)]
([2, 8, 1], [2, 8, 5])
tensor(18.7021, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -151.1012420654297)]
([2, 8, 0], [2, 8, 5])
tensor(17.1627, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -151

  5%|▌         | 38/725 [1:19:49<27:39:04, 144.90s/it]

test：0.0, test mean: 0.3
([2, 2, 1], [2, 2, 1])
tensor(26.6708, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  5%|▌         | 39/725 [1:20:15<20:49:05, 109.25s/it]

([1, 6, 0], [1, 6, 0])
tensor(14.0249, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  6%|▌         | 40/725 [1:20:42<16:03:08, 84.36s/it] 

([7, 0, 1], [7, 0, 1])
tensor(14.2719, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  6%|▌         | 41/725 [1:21:05<12:34:11, 66.16s/it]

([1, 0, 1], [1, 0, 1])
tensor(13.0108, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.2017885148525238), (499999, 0.0), (499999, 0.0)]


  6%|▌         | 42/725 [1:21:29<10:06:41, 53.30s/it]

([2, 3, 1], [2, 3, 1])
tensor(21.6517, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -17.984970092773438), (499999, 0.0)]
([2, 3, 1], [2, 3, 1])
tensor(21.1581, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -52.257938385009766), (499999, 0.0)]
([2, 3, 1], [2, 3, 1])
tensor(21.0642, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -108.62635803222656), (499999, 0.0)]
([2, 3, 1], [2, 3, 1])
tensor(20.2168, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -137.9905548095703), (499999, 0.0)]
([2, 3, 1], [2, 3, 1])
tensor(19.0733, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -167.5242919921875), (499999, 0.0)]
([2, 3, 1], [2, 3, 1])
tensor(18.3487, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -195.49795532226562), (499999, 0.0)]
([2, 3, 1], [2, 3, 1])
tensor(17.7689, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -218.233428955078

  6%|▌         | 43/725 [1:26:01<22:32:38, 119.00s/it]

test：0.0, test mean: 0.28125
([2, 1, 7], [2, 1, 7])
tensor(18.7247, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.7659730911254883)]
([2, 1, 7], [2, 1, 7])
tensor(17.0070, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 14.173625946044922)]
([2, 1, 7], [2, 1, 7])
tensor(13.8128, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 140.9462127685547)]
([2, 1, 7], [2, 1, 7])
tensor(11.6824, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 140.14028930664062)]
([2, 1, 0], [2, 1, 7])
tensor(11.0024, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 140.66049194335938)]
([2, 1, 0], [2, 1, 7])
tensor(10.9958, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 140.9627685546875)]
([2, 1, 0], [2, 1, 7])
tensor(10.9944, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (49

  6%|▌         | 44/725 [1:31:02<32:50:36, 173.62s/it]

test：0.0, test mean: 0.2647058823529412
([1, 8, 7], [1, 8, 7])
tensor(19.8513, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.005024348385632038), (499999, 0.0), (380927, 2.9181621074676514)]
([1, 8, 7], [1, 8, 7])
tensor(17.9017, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.010048696771264076), (499999, 0.0), (380927, 16.938066482543945)]
([1, 8, 7], [1, 8, 7])
tensor(16.0640, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.015073046088218689), (499999, 0.0), (380927, 7.384821891784668)]
([1, 8, 0], [1, 8, 7])
tensor(13.8023, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.020097393542528152), (499999, 0.0), (380927, 7.361053466796875)]
([1, 8, 0], [1, 8, 7])
tensor(13.8023, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.025121744722127914), (499999, 0.0), (380927, 7.338243007659912)]
([1, 8, 0], [1, 8, 7])
tensor(13.8023, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.030146092176437378), (499999, 0.0), (380927, 7.3154315948

  6%|▌         | 45/725 [1:36:07<40:15:09, 213.10s/it]

test：0.0, test mean: 0.25
([7, 2, 2], [7, 2, 2])
tensor(26.6107, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.888364791870117), (499999, -0.009563574567437172), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(24.7044, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 14.569595336914062), (499999, -0.019127149134874344), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(20.6543, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 82.69209289550781), (499999, -0.02869071438908577), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(18.6541, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 81.06161499023438), (499999, -0.03825429826974869), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(18.6413, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 80.02379608154297), (499999, -0.04781787842512131), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(18.6383, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 79.46879577636719), (499999, -0.05738144367933273), (499999, 0.0)]
([0, 

  6%|▋         | 46/725 [1:40:45<43:50:06, 232.41s/it]

test：0.0, test mean: 0.23684210526315788
([2, 1, 2], [2, 1, 2])
tensor(31.5802, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  6%|▋         | 47/725 [1:41:11<32:06:40, 170.50s/it]

([1, 3, 2], [1, 1, 2])
tensor(19.8852, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 48/725 [1:41:37<23:54:47, 127.16s/it]

([2, 1, 2], [2, 1, 2])
tensor(21.2859, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 49/725 [1:42:01<18:05:02, 96.31s/it] 

([0, 7, 1], [0, 7, 1])
tensor(10.7789, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 50/725 [1:42:26<14:01:08, 74.77s/it]

([2, 0, 7], [2, 0, 7])
tensor(17.9737, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.5570147037506104)]
([2, 0, 7], [2, 0, 7])
tensor(17.9405, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -8.421751022338867)]
([2, 0, 7], [2, 0, 7])
tensor(11.4287, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -30.918014526367188)]
([2, 0, 0], [2, 0, 7])
tensor(9.8173, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -33.875457763671875)]
([2, 0, 3], [2, 0, 7])
tensor(11.8931, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 42.21426010131836)]
([2, 0, 0], [2, 0, 7])
tensor(9.8773, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 37.60335922241211)]
([2, 0, 3], [2, 0, 7])
tensor(15.3170, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 186.193

  7%|▋         | 51/725 [1:47:22<26:28:23, 141.40s/it]

test：0.0, test mean: 0.225
([8, 3, 2], [8, 3, 2])
tensor(24.0761, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -17.19188117980957), (499999, 0.0)]
([8, 3, 2], [8, 3, 2])
tensor(23.7361, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -28.51622200012207), (499999, 0.0)]
([8, 3, 2], [8, 3, 2])
tensor(22.1971, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -35.810028076171875), (499999, 0.0)]
([8, 3, 2], [8, 3, 2])
tensor(20.7467, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -32.61133575439453), (499999, 0.0)]
([8, 3, 2], [8, 3, 2])
tensor(19.5178, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -44.102699279785156), (499999, 0.0)]
([8, 0, 2], [8, 3, 2])
tensor(19.1262, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -30.834182739257812), (499999, 0.0)]
([8, 0, 2], [8, 3, 2])
tensor(19.0310, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), 

  7%|▋         | 52/725 [1:52:04<34:18:58, 183.56s/it]

test：0.0, test mean: 0.21428571428571427
([2, 2, 8], [2, 2, 8])
tensor(18.7305, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -13.131996154785156)]
([2, 2, 8], [2, 2, 8])
tensor(19.0418, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -40.73749542236328)]
([2, 2, 8], [2, 2, 8])
tensor(19.0294, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -69.66166687011719)]
([2, 2, 8], [2, 2, 8])
tensor(18.7921, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -92.01939392089844)]
([2, 2, 8], [2, 2, 8])
tensor(18.3507, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -121.62711334228516)]
([2, 2, 8], [2, 2, 8])
tensor(17.5860, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -146.96592712402344)]
([2, 2, 8], [2, 2, 8])
tensor(17.0037, device='cuda:0', grad_fn=<NllLossBackward0>)
[(

  7%|▋         | 53/725 [1:56:45<39:42:20, 212.71s/it]

test：0.0, test mean: 0.20454545454545456
([1, 8, 1], [1, 8, 1])
tensor(15.0499, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  7%|▋         | 54/725 [1:57:11<29:13:25, 156.79s/it]

([1, 1, 2], [1, 1, 2])
tensor(23.9942, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 55/725 [1:57:40<22:01:36, 118.35s/it]

([2, 0, 0], [2, 0, 0])
tensor(11.3951, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 56/725 [1:58:06<16:50:19, 90.61s/it] 

([6, 2, 0], [6, 2, 0])
tensor(10.6205, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.010905302858191e-08)]
([6, 2, 0], [6, 2, 0])
tensor(10.6205, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -1.7699578336305422e-07)]
([6, 2, 0], [6, 2, 0])
tensor(10.6205, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -2.6937699658446945e-05)]
([6, 2, 0], [6, 2, 0])
tensor(10.6205, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -3.237190321669914e-05)]
([6, 2, 0], [6, 2, 0])
tensor(10.6205, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -4.6271015889942646e-05)]
([6, 2, 0], [6, 2, 0])
tensor(10.6205, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -4.628152964869514e-05)]
([6, 2, 0], [6, 2, 0])
tensor(10.6205, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499

  8%|▊         | 57/725 [2:03:07<28:31:10, 153.70s/it]

test：1.0, test mean: 0.2391304347826087
([7, 2, 7], [7, 2, 7])
tensor(20.8719, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.9423108100891113), (499999, 0.0), (380927, 2.9181621074676514)]
([7, 2, 7], [7, 2, 7])
tensor(16.4925, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 23.331283569335938), (499999, 0.0), (380927, 21.694997787475586)]
([7, 2, 7], [7, 2, 7])
tensor(12.6470, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -1.583951473236084), (499999, 0.0), (380927, 13.64938735961914)]
([0, 2, 0], [7, 2, 7])
tensor(8.6866, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -1.5841243267059326), (499999, 0.0), (380927, 13.547920227050781)]
([0, 2, 0], [7, 2, 7])
tensor(8.6864, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -1.584296703338623), (499999, 0.0), (380927, 13.466558456420898)]
([0, 2, 0], [7, 2, 7])
tensor(8.6863, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -1.5844669342041016), (499999, 0.0), (380927, 13.405599594116211)]
([

  8%|▊         | 58/725 [2:08:07<36:36:01, 197.54s/it]

test：0.0, test mean: 0.22916666666666666
([2, 5, 1], [2, 5, 1])
tensor(29.1208, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.015038280747830868), (499999, 0.0)]


  8%|▊         | 59/725 [2:08:30<26:52:07, 145.24s/it]

([1, 0, 1], [1, 0, 1])
tensor(15.6475, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 60/725 [2:08:57<20:18:40, 109.96s/it]

([1, 8, 2], [1, 8, 2])
tensor(21.8264, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  8%|▊         | 61/725 [2:09:23<15:36:10, 84.59s/it] 

([2, 2, 2], [2, 2, 2])
tensor(23.0042, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▊         | 62/725 [2:09:49<12:21:45, 67.13s/it]

([2, 2, 2], [2, 2, 2])
tensor(28.8332, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▊         | 63/725 [2:10:16<10:06:59, 55.01s/it]

([1, 1, 8], [1, 1, 8])
tensor(17.8248, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 64/725 [2:10:43<8:33:29, 46.61s/it] 

([8, 0, 8], [8, 0, 8])
tensor(11.5333, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 65/725 [2:11:11<7:30:10, 40.93s/it]

([5, 2, 1], [5, 2, 1])
tensor(18.4402, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 66/725 [2:11:37<6:42:15, 36.62s/it]

([8, 0, 2], [4, 0, 2])
tensor(7.9984, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 67/725 [2:12:04<6:08:02, 33.56s/it]

([8, 2, 8], [8, 2, 8])
tensor(17.8924, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


  9%|▉         | 68/725 [2:12:30<5:45:04, 31.51s/it]

([8, 3, 1], [8, 3, 1])
tensor(13.8270, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|▉         | 69/725 [2:12:57<5:29:13, 30.11s/it]

([2, 2, 1], [2, 2, 1])
tensor(28.8810, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|▉         | 70/725 [2:13:24<5:17:08, 29.05s/it]

([2, 8, 2], [2, 8, 2])
tensor(18.4671, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0012840430717915297), (499999, 0.008525494486093521), (499999, 0.0)]


 10%|▉         | 71/725 [2:13:48<4:59:08, 27.44s/it]

([1, 1, 6], [1, 1, 6])
tensor(15.0370, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|▉         | 72/725 [2:14:13<4:53:00, 26.92s/it]

([2, 2, 1], [2, 2, 1])
tensor(28.1738, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.011318929493427277), (499999, 0.0), (499999, 0.0)]


 10%|█         | 73/725 [2:14:39<4:49:44, 26.66s/it]

([2, 8, 0], [2, 8, 0])
tensor(7.4555, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -11.809257507324219), (499999, 0.0)]
([2, 8, 0], [2, 8, 0])
tensor(7.7814, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -20.774173736572266), (499999, 0.0)]
([2, 8, 0], [2, 8, 0])
tensor(7.6980, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -34.729347229003906), (499999, 0.0)]
([2, 8, 0], [2, 8, 0])
tensor(7.0218, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -45.35823059082031), (499999, 0.0)]
([2, 0, 0], [2, 8, 0])
tensor(6.7220, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -51.3652458190918), (499999, 0.0)]
([2, 0, 0], [2, 8, 0])
tensor(6.6186, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -54.071434020996094), (499999, 0.0)]
([2, 0, 0], [2, 8, 0])
tensor(6.5958, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -54.313934326171875), (49

 10%|█         | 74/725 [2:19:22<18:41:24, 103.35s/it]

test：0.0, test mean: 0.22
([2, 7, 1], [2, 7, 1])
tensor(14.6106, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 10%|█         | 75/725 [2:19:50<14:37:37, 81.01s/it] 

([7, 6, 1], [7, 6, 1])
tensor(18.3708, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (499999, 0.0)]
([7, 6, 1], [7, 6, 1])
tensor(14.1239, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.336992263793945), (499999, 0.0), (499999, 0.0)]
([7, 6, 1], [7, 6, 1])
tensor(13.8257, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 10.740046501159668), (499999, 0.0), (499999, 0.0)]
([0, 6, 1], [7, 6, 1])
tensor(9.9292, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 15.107583999633789), (499999, 0.0), (499999, 0.0)]
([0, 6, 1], [7, 6, 1])
tensor(9.8755, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 15.106623649597168), (499999, 0.0), (499999, 0.0)]
([0, 6, 1], [7, 6, 1])
tensor(9.8755, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 15.105664253234863), (499999, 0.0), (499999, 0.0)]
([0, 6, 1], [7, 6, 1])
tensor(9.8755, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 15.104704856872559), (499999, 0.0), (499

 10%|█         | 76/725 [2:24:40<25:52:02, 143.49s/it]

test：0.0, test mean: 0.21153846153846154
([2, 2, 2], [2, 2, 2])
tensor(26.6154, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0014230635715648532), (499999, 0.0), (499999, 0.0)]


 11%|█         | 77/725 [2:25:08<19:35:42, 108.86s/it]

([5, 1, 8], [5, 1, 8])
tensor(15.0869, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 11%|█         | 78/725 [2:25:39<15:21:48, 85.49s/it] 

([8, 0, 0], [8, 0, 0])
tensor(1.4738, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 11%|█         | 79/725 [2:26:06<12:12:15, 68.01s/it]

([2, 2, 2], [2, 2, 2])
tensor(24.9477, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 11%|█         | 80/725 [2:26:32<9:55:39, 55.41s/it] 

([2, 1, 6], [2, 1, 6])
tensor(20.7948, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 11%|█         | 81/725 [2:26:59<8:22:12, 46.79s/it]

([2, 5, 3], [2, 5, 3])
tensor(21.0023, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.03132149577140808), (499999, 0.0), (499999, 0.0)]


 11%|█▏        | 82/725 [2:27:27<7:21:42, 41.22s/it]

([1, 3, 1], [1, 4, 1])
tensor(13.2680, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 11%|█▏        | 83/725 [2:27:54<6:36:56, 37.10s/it]

([0, 5, 3], [0, 5, 3])
tensor(9.3142, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 0.0014023378025740385), (499999, 0.0), (499999, 0.0)]
([0, 5, 3], [0, 5, 3])
tensor(9.3142, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 0.012996383011341095), (499999, 0.0), (499999, 0.0)]
([0, 5, 3], [0, 5, 3])
tensor(9.3140, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 0.012641152366995811), (499999, 0.0), (499999, 0.0)]
([0, 5, 3], [0, 5, 3])
tensor(9.3140, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 0.012494166381657124), (499999, 0.0), (499999, 0.0)]
([0, 5, 3], [0, 5, 3])
tensor(9.3140, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 0.010439479723572731), (499999, 0.0), (499999, 0.0)]
([0, 5, 3], [0, 5, 3])
tensor(9.3140, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 0.009955784305930138), (499999, 0.0), (499999, 0.0)]
([0, 5, 3], [0, 5, 3])
tensor(9.3140, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 0.009940780699253082), (499999

 12%|█▏        | 84/725 [2:32:32<19:26:45, 109.21s/it]

test：1.0, test mean: 0.24074074074074073
([1, 3, 8], [1, 3, 8])
tensor(19.2318, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.03419571369886398), (499999, 0.0), (499999, 0.0)]


 12%|█▏        | 85/725 [2:32:59<15:00:55, 84.46s/it] 

([0, 0, 8], [0, 0, 8])
tensor(6.7943, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 12%|█▏        | 86/725 [2:33:26<11:55:42, 67.20s/it]

([7, 1, 8], [7, 1, 8])
tensor(16.1786, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9369421005249023), (499999, 0.0), (499999, 0.0)]
([7, 1, 8], [7, 1, 8])
tensor(14.1652, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 24.783546447753906), (499999, 0.0), (499999, 0.0)]
([7, 1, 8], [7, 1, 8])
tensor(12.5886, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.136608123779297), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(9.9617, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.603076934814453), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(9.9601, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.7265625), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(9.9600, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.80560874938965), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(9.9600, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.88465118408203), (499999, 0.0), (499999, 0.0)]

 12%|█▏        | 87/725 [2:38:05<23:10:45, 130.79s/it]

test：0.0, test mean: 0.23214285714285715
([5, 5, 2], [5, 5, 2])
tensor(12.9413, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 12%|█▏        | 88/725 [2:38:32<17:40:01, 99.85s/it] 

([5, 0, 8], [8, 0, 8])
tensor(7.6307, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 12%|█▏        | 89/725 [2:39:02<13:54:22, 78.71s/it]

([3, 7, 1], [3, 7, 1])
tensor(12.1135, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9423112869262695), (499999, 0.0)]
([3, 7, 1], [3, 7, 1])
tensor(9.3409, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 19.13813018798828), (499999, 0.0)]
([3, 7, 1], [3, 7, 1])
tensor(7.7588, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.012642860412597656), (499999, 0.0)]
([3, 0, 1], [3, 7, 1])
tensor(4.1564, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.420969009399414), (499999, 0.0)]
([3, 0, 1], [3, 7, 1])
tensor(4.1543, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.485154151916504), (499999, 0.0)]
([3, 0, 1], [3, 7, 1])
tensor(4.1527, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.3442649841308594), (499999, 0.0)]
([3, 0, 1], [3, 7, 1])
tensor(4.1513, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.297206401824951), (499

 12%|█▏        | 90/725 [2:44:20<26:33:50, 150.60s/it]

test：0.0, test mean: 0.22413793103448276
([0, 7, 0], [0, 7, 0])
tensor(3.3645, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -6.786143057979643e-05), (499999, 0.0), (499999, 0.0)]


 13%|█▎        | 91/725 [2:44:51<20:12:42, 114.77s/it]

([1, 6, 2], [1, 6, 2])
tensor(10.0227, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 13%|█▎        | 92/725 [2:45:23<15:48:37, 89.92s/it] 

([2, 0, 0], [2, 0, 0])
tensor(8.8190, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 1.4138940059638117e-05), (442367, -4.4703814637614414e-05)]
([2, 0, 0], [2, 0, 0])
tensor(8.8213, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.00011973394430242479), (442367, 0.37103715538978577)]
([2, 0, 0], [2, 0, 0])
tensor(8.8190, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.00011976120003964752), (442367, 0.3710256814956665)]
([2, 0, 0], [2, 0, 0])
tensor(8.8190, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.00011978845577687025), (442367, 0.37099605798721313)]
([2, 0, 0], [2, 0, 0])
tensor(8.8190, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.00011981572606600821), (442367, 0.37100276350975037)]
([2, 0, 0], [2, 0, 0])
tensor(8.8190, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.00011984297452727333), (442367, 0.3710010051727295)]
([2, 0, 0], [2

 13%|█▎        | 93/725 [2:51:02<28:53:11, 164.54s/it]

test：1.0, test mean: 0.25
([2, 1, 2], [2, 1, 2])
tensor(22.3356, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 13%|█▎        | 94/725 [2:51:33<21:48:52, 124.46s/it]

([1, 7, 5], [1, 7, 5])
tensor(12.2090, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([1, 7, 5], [1, 7, 5])
tensor(8.8312, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 9.438080787658691), (499999, 0.0)]
([1, 7, 5], [1, 7, 5])
tensor(7.5478, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.744040012359619), (499999, 0.0)]
([1, 0, 5], [1, 7, 5])
tensor(3.2240, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.7060465812683105), (499999, 0.0)]
([1, 0, 5], [1, 7, 5])
tensor(3.2238, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.6711626052856445), (499999, 0.0)]
([1, 0, 5], [1, 7, 5])
tensor(3.2237, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.63694429397583), (499999, 0.0)]
([1, 0, 5], [1, 7, 5])
tensor(3.2234, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.615561008453369), (49999

 13%|█▎        | 95/725 [2:56:48<31:47:29, 181.67s/it]

test：0.0, test mean: 0.24193548387096775
([1, 1, 1], [1, 1, 1])
tensor(18.9405, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.03672375530004501), (499999, 0.0)]


 13%|█▎        | 96/725 [2:57:19<23:50:52, 136.49s/it]

([8, 2, 8], [8, 2, 8])
tensor(12.3169, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 13%|█▎        | 97/725 [2:57:52<18:22:41, 105.35s/it]

([2, 0, 5], [2, 0, 5])
tensor(14.1046, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 14%|█▎        | 98/725 [2:58:20<14:19:28, 82.25s/it] 

([7, 2, 1], [7, 2, 1])
tensor(20.2965, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.712027072906494), (499999, 0.0), (499999, 0.0)]
([7, 2, 1], [7, 2, 1])
tensor(18.6252, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 13.772071838378906), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(12.4471, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 14.18368911743164), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(12.4442, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 14.202890396118164), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(12.4440, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 14.210476875305176), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(12.4439, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 14.21407699584961), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(12.4439, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 14.217721939086914), (499999, 0.0), (49

 14%|█▎        | 99/725 [3:03:32<26:16:40, 151.12s/it]

test：0.0, test mean: 0.234375
([1, 2, 1], [1, 2, 1])
tensor(21.1443, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 14%|█▍        | 100/725 [3:04:01<19:51:41, 114.40s/it]

([7, 2, 1], [7, 2, 1])
tensor(23.4981, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (499999, 0.0)]
([7, 2, 1], [7, 2, 1])
tensor(21.7486, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 23.832683563232422), (499999, 0.0), (499999, 0.0)]
([7, 2, 1], [7, 2, 1])
tensor(19.7836, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 13.312124252319336), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(17.4261, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 12.797538757324219), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(17.4253, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 12.549787521362305), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(17.4245, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 12.375982284545898), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(17.4241, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 12.222674369812012), (499999, 0.0), 

 14%|█▍        | 101/725 [3:09:42<31:37:10, 182.42s/it]

test：0.0, test mean: 0.22727272727272727
([8, 7, 0], [8, 7, 0])
tensor(11.4497, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 3.1132798194885254), (499999, 0.0)]
([8, 7, 0], [8, 7, 0])
tensor(9.8757, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 11.953174591064453), (499999, 0.0)]
([8, 7, 0], [8, 7, 0])
tensor(8.6054, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.10911917686462402), (499999, 0.0)]
([8, 0, 0], [8, 7, 0])
tensor(5.4071, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -5.30255126953125), (499999, 0.0)]
([8, 0, 0], [8, 7, 0])
tensor(5.3480, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.897642135620117), (499999, 0.0)]
([8, 0, 0], [8, 7, 0])
tensor(5.3434, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -4.725063323974609), (499999, 0.0)]
([8, 0, 0], [8, 7, 0])
tensor(5.3423, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0

 14%|█▍        | 102/725 [3:15:26<39:59:52, 231.13s/it]

test：0.0, test mean: 0.22058823529411764
([1, 6, 5], [1, 6, 5])
tensor(18.7489, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 14%|█▍        | 103/725 [3:16:00<29:40:01, 171.71s/it]

([8, 8, 0], [8, 8, 0])
tensor(4.3794, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (196607, -11.742866516113281), (499999, -0.00025623830151744187)]
([8, 8, 0], [8, 8, 0])
tensor(4.9454, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (196607, -48.49705505371094), (499999, -0.0005124766030348837)]
([8, 8, 0], [8, 8, 0])
tensor(4.8489, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (196607, -84.62600708007812), (499999, -0.0007687148754484951)]
([8, 8, 0], [8, 8, 0])
tensor(4.8463, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (196607, -119.47224426269531), (499999, -0.0010249532060697675)]
([8, 8, 0], [8, 8, 0])
tensor(4.8023, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (196607, -147.5738067626953), (499999, -0.001281191362068057)]
([8, 8, 0], [8, 8, 0])
tensor(4.8698, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (196607, -180.60130310058594), (499999, -0.0015374296344816685)]
([8, 8, 0], [8, 8, 

 14%|█▍        | 104/725 [3:21:26<37:37:01, 218.07s/it]

test：1.0, test mean: 0.24285714285714285
([5, 8, 1], [5, 8, 1])
tensor(14.7870, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.09475427865982056)]


 14%|█▍        | 105/725 [3:21:54<27:44:00, 161.03s/it]

([1, 1, 1], [1, 1, 1])
tensor(21.2653, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 15%|█▍        | 106/725 [3:22:23<20:54:40, 121.62s/it]

([0, 2, 8], [0, 2, 8])
tensor(7.3898, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 4.1902176235453226e-06), (499999, 0.0), (499999, 0.0)]
([0, 2, 8], [0, 2, 8])
tensor(7.3898, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 4.2344163375673816e-06), (499999, 0.0), (499999, 0.0)]
([0, 2, 8], [0, 2, 8])
tensor(7.3898, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 4.231297225487651e-06), (499999, 0.0), (499999, 0.0)]
([0, 2, 8], [0, 2, 8])
tensor(7.3898, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 4.228503712511156e-06), (499999, 0.0), (499999, 0.0)]
([0, 2, 8], [0, 2, 8])
tensor(7.3898, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 4.225709744787309e-06), (499999, 0.0), (499999, 0.0)]
([0, 2, 8], [0, 2, 8])
tensor(7.3898, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 4.222915777063463e-06), (499999, 0.0), (499999, 0.0)]
([0, 2, 8], [0, 2, 8])
tensor(7.3898, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 4.2201218093396164e-06)

 15%|█▍        | 107/725 [3:28:19<32:55:48, 191.83s/it]

test：1.0, test mean: 0.2638888888888889
([8, 2, 1], [8, 2, 1])
tensor(18.8800, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.000448455975856632)]


 15%|█▍        | 108/725 [3:28:54<24:50:15, 144.92s/it]

([0, 2, 8], [0, 2, 8])
tensor(10.8810, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 15%|█▌        | 109/725 [3:29:26<18:59:40, 111.01s/it]

([2, 1, 0], [2, 1, 0])
tensor(14.9307, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.004743475466966629), (499999, 0.0)]


 15%|█▌        | 110/725 [3:29:58<14:53:15, 87.15s/it] 

([3, 2, 5], [3, 2, 5])
tensor(21.8601, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 15%|█▌        | 111/725 [3:30:27<11:54:48, 69.85s/it]

([8, 1, 3], [8, 1, 3])
tensor(16.2532, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (393215, -17.17266845703125)]
([8, 1, 3], [8, 1, 3])
tensor(15.8661, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (393215, -27.599117279052734)]
([8, 1, 3], [8, 1, 3])
tensor(14.2402, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (393215, -23.652767181396484)]
([8, 1, 0], [8, 1, 3])
tensor(12.2517, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (393215, -16.392650604248047)]
([8, 1, 0], [8, 1, 3])
tensor(12.1034, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (393215, -15.95133113861084)]
([8, 1, 0], [8, 1, 3])
tensor(12.0859, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (393215, -15.925272941589355)]
([8, 1, 0], [8, 1, 3])
tensor(12.0857, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (393215, -1

 15%|█▌        | 112/725 [3:36:28<26:46:23, 157.23s/it]

test：0.0, test mean: 0.25675675675675674
([2, 0, 0], [2, 0, 0])
tensor(6.9593, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -1.774283646227559e-06), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(6.9593, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -3.5485675198287936e-06), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(6.9593, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -5.322850938682677e-06), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(6.9593, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -7.097134584910236e-06), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(6.9593, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -8.871418685885146e-06), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(6.9593, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -1.0645701877365354e-05), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(6.9593, device='cuda:0', grad_fn=<NllLo

 16%|█▌        | 113/725 [3:41:38<34:31:23, 203.08s/it]

test：1.0, test mean: 0.27631578947368424
([1, 7, 0], [1, 7, 0])
tensor(14.9902, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([1, 7, 0], [1, 7, 0])
tensor(10.3587, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 14.812799453735352), (499999, 0.0)]
([1, 7, 0], [1, 7, 0])
tensor(9.9612, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.3111248016357422), (499999, 0.0)]
([1, 0, 0], [1, 7, 0])
tensor(5.9428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.2473297119140625), (499999, 0.0)]
([1, 0, 0], [1, 7, 0])
tensor(5.9428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.1835365295410156), (499999, 0.0)]
([1, 0, 0], [1, 7, 0])
tensor(5.9428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -1.1197452545166016), (499999, 0.0)]
([1, 0, 0], [1, 7, 0])
tensor(5.9428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(4999

 16%|█▌        | 114/725 [3:47:01<40:32:39, 238.89s/it]

test：0.0, test mean: 0.2692307692307692
([0, 0, 7], [0, 0, 7])
tensor(6.4550, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.0015628230758011341), (499999, 0.0), (368639, 2.765970230102539)]
([0, 0, 7], [0, 0, 7])
tensor(3.8166, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.0023063686676323414), (499999, 0.0), (368639, 25.49026107788086)]
([0, 0, 7], [0, 0, 7])
tensor(0.9012, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 0.0005535269738174975), (499999, 0.0), (368639, 13.641780853271484)]
([0, 0, 0], [0, 0, 7])
tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.003516268217936158), (499999, 0.0), (368639, 13.64180850982666)]
([0, 0, 0], [0, 0, 7])
tensor(3.2900e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.003535897471010685), (499999, 0.0), (368639, 13.641840934753418)]
([0, 0, 0], [0, 0, 7])
tensor(3.2781e-05, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.003585154889151454), (499999, 0.0), (368639, 13

 16%|█▌        | 115/725 [3:53:07<46:56:11, 277.00s/it]

test：0.5, test mean: 0.275
([2, 2, 2], [2, 2, 2])
tensor(33.3591, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 16%|█▌        | 116/725 [3:53:38<34:23:45, 203.33s/it]

([5, 0, 1], [5, 0, 1])
tensor(11.0159, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.0544648887389485e-07), (499999, 0.0)]
([5, 0, 1], [5, 0, 1])
tensor(11.0159, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -8.940119045064421e-08), (499999, 0.0)]
([5, 0, 1], [5, 0, 1])
tensor(11.0159, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.0015124499332159758), (499999, 0.0)]
([5, 0, 1], [5, 0, 1])
tensor(11.0159, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.0015116288559511304), (499999, 0.0)]
([5, 0, 1], [5, 0, 1])
tensor(11.0159, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.001512010581791401), (499999, 0.0)]
([5, 0, 1], [5, 0, 1])
tensor(11.1181, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -12.088617324829102), (499999, 0.0)]
([5, 0, 1], [5, 0, 1])
tensor(11.0159, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -

 16%|█▌        | 117/725 [3:59:00<40:19:26, 238.76s/it]

test：1.0, test mean: 0.2926829268292683
([1, 1, 0], [1, 1, 0])
tensor(12.0331, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 16%|█▋        | 118/725 [3:59:31<29:45:29, 176.49s/it]

([3, 1, 7], [3, 1, 7])
tensor(18.6852, device='cuda:0', grad_fn=<NllLossBackward0>)
[(341156, 22.692888259887695), (499999, 0.0), (380927, 2.9423112869262695)]
([1, 1, 7], [3, 1, 7])
tensor(13.7643, device='cuda:0', grad_fn=<NllLossBackward0>)
[(341156, 162.9517822265625), (499999, 0.0), (380927, 12.906265258789062)]
([1, 1, 7], [3, 1, 7])
tensor(11.5295, device='cuda:0', grad_fn=<NllLossBackward0>)
[(341156, 185.9044647216797), (499999, 0.0), (380927, 9.478619575500488)]
([0, 1, 0], [3, 1, 7])
tensor(7.0064, device='cuda:0', grad_fn=<NllLossBackward0>)
[(341156, 202.51060485839844), (499999, 0.0), (380927, 14.23872184753418)]
([0, 1, 0], [3, 1, 7])
tensor(6.8484, device='cuda:0', grad_fn=<NllLossBackward0>)
[(341156, 200.88211059570312), (499999, 0.0), (380927, 14.116445541381836)]
([0, 1, 0], [3, 1, 7])
tensor(6.8117, device='cuda:0', grad_fn=<NllLossBackward0>)
[(341156, 207.41323852539062), (499999, 0.0), (380927, 13.994169235229492)]
([0, 1, 0], [3, 1, 7])
tensor(6.9494, device='c

 16%|█▋        | 119/725 [4:05:19<38:22:31, 227.97s/it]

test：0.0, test mean: 0.2857142857142857
([2, 0, 5], [2, 0, 5])
tensor(8.8614, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 17%|█▋        | 120/725 [4:05:50<28:22:20, 168.83s/it]

([2, 1, 8], [2, 1, 8])
tensor(21.9907, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 17%|█▋        | 121/725 [4:06:22<21:26:02, 127.75s/it]

([0, 3, 2], [0, 3, 2])
tensor(11.1569, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, 3.588781055441359e-06), (499999, 0.0), (499999, 0.0)]
([0, 3, 2], [0, 3, 2])
tensor(11.1569, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, 6.849218152638059e-06), (499999, 0.0), (499999, 0.0)]
([0, 3, 2], [0, 3, 2])
tensor(11.1569, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, -0.0005705429357476532), (499999, 0.0), (499999, 0.0)]
([0, 3, 2], [0, 3, 2])
tensor(11.1569, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, -0.0005705614457838237), (499999, 0.0), (499999, 0.0)]
([0, 3, 2], [0, 3, 2])
tensor(11.1569, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, -0.0005438667722046375), (499999, 0.0), (499999, 0.0)]
([0, 3, 2], [0, 3, 2])
tensor(11.1569, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, -0.0005438679363578558), (499999, 0.0), (499999, 0.0)]
([0, 3, 2], [0, 3, 2])
tensor(11.1569, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, -0.00054320815

 17%|█▋        | 122/725 [4:11:31<30:32:26, 182.33s/it]

test：1.0, test mean: 0.3023255813953488
([8, 1, 2], [8, 1, 2])
tensor(21.6742, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 17%|█▋        | 123/725 [4:12:00<22:46:59, 136.25s/it]

([0, 7, 1], [0, 7, 1])
tensor(7.3529, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 17%|█▋        | 124/725 [4:12:28<17:19:50, 103.81s/it]

([1, 1, 1], [1, 1, 1])
tensor(17.3634, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 17%|█▋        | 125/725 [4:12:56<13:30:24, 81.04s/it] 

([2, 3, 2], [2, 3, 2])
tensor(22.3482, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 17%|█▋        | 126/725 [4:13:23<10:48:05, 64.92s/it]

([2, 5, 1], [2, 2, 1])
tensor(18.5430, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 18%|█▊        | 127/725 [4:13:52<8:57:18, 53.91s/it] 

([0, 2, 2], [0, 2, 2])
tensor(18.2899, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 4.409633675095392e-06), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(18.2899, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.00038218835834413767), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(18.2899, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.0003822590224444866), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(18.2899, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.00038306511123664677), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(18.2899, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.00038316112477332354), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(18.2899, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.0003830691275652498), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(18.2899, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.0003830515197

 18%|█▊        | 128/725 [4:19:03<21:43:12, 130.98s/it]

test：1.0, test mean: 0.3181818181818182
([7, 2, 0], [7, 2, 0])
tensor(13.2938, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.693246841430664), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(11.4362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.65737533569336), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(9.7263, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 27.41108512878418), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(6.5351, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 27.411447525024414), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(6.5351, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 27.41180419921875), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(6.5351, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 27.41216278076172), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(6.5351, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 27.412

 18%|█▊        | 129/725 [4:24:04<30:07:41, 181.98s/it]

test：0.0, test mean: 0.3111111111111111
([2, 8, 0], [2, 8, 7])
tensor(11.1437, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 18%|█▊        | 130/725 [4:24:32<22:28:13, 135.96s/it]

([2, 7, 2], [2, 7, 2])
tensor(24.3109, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9369421005249023), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(22.6459, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 26.64383316040039), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(19.8820, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.7859315872192383), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(17.9716, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.7753267288208008), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(17.9716, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.7647218704223633), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(17.9716, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.754115104675293), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(17.9715, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.7363252639770508), (4

 18%|█▊        | 131/725 [4:29:37<30:48:00, 186.67s/it]

test：0.0, test mean: 0.30434782608695654
([2, 8, 1], [2, 8, 1])
tensor(17.9341, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 18%|█▊        | 132/725 [4:30:03<22:47:13, 138.34s/it]

([2, 2, 1], [2, 2, 1])
tensor(22.0681, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 18%|█▊        | 133/725 [4:30:29<17:13:27, 104.74s/it]

([7, 3, 1], [7, 3, 1])
tensor(18.3982, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.7333524227142334), (499999, 0.0), (499999, 0.0)]
([7, 3, 1], [7, 3, 1])
tensor(18.3998, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -1.0796279907226562), (499999, 0.0), (499999, 0.0)]
([7, 3, 1], [7, 3, 1])
tensor(18.3942, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -8.504205703735352), (499999, 0.0), (499999, 0.0)]
([7, 3, 1], [7, 3, 1])
tensor(12.2464, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -50.69944763183594), (499999, 0.0), (499999, 0.0)]
([7, 3, 1], [7, 3, 1])
tensor(10.7221, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -72.71253204345703), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [7, 3, 1])
tensor(10.1133, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -72.70892333984375), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [7, 3, 1])
tensor(10.1133, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -72.705322265625), (499999, 0.0), 

 18%|█▊        | 134/725 [4:35:26<26:40:00, 162.44s/it]

test：0.0, test mean: 0.2978723404255319
([8, 1, 3], [8, 1, 3])
tensor(12.2363, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 19%|█▊        | 135/725 [4:35:53<19:58:39, 121.90s/it]

([7, 5, 7], [7, 5, 7])
tensor(9.2252, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 19%|█▉        | 136/725 [4:36:20<15:15:47, 93.29s/it] 

([1, 1, 0], [1, 1, 5])
tensor(14.4190, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (196607, 0.004185006953775883)]
([1, 1, 0], [1, 1, 5])
tensor(14.4184, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (196607, 0.004246653523296118)]
([1, 1, 0], [1, 1, 5])
tensor(14.4184, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (196607, 0.004263128619641066)]
([1, 1, 0], [1, 1, 5])
tensor(14.4184, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (196607, 0.004261842463165522)]
([1, 1, 0], [1, 1, 5])
tensor(14.4184, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (196607, 0.0042594666592776775)]
([1, 1, 0], [1, 1, 5])
tensor(14.4184, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (196607, 0.004260150715708733)]
([1, 1, 0], [1, 1, 5])
tensor(14.4184, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (1

 19%|█▉        | 137/725 [4:42:37<29:07:38, 178.33s/it]

test：0.0, test mean: 0.2916666666666667
([1, 0, 8], [1, 0, 8])
tensor(12.2264, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 19%|█▉        | 138/725 [4:43:06<21:48:00, 133.70s/it]

([5, 6, 1], [5, 6, 1])
tensor(9.8356, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 19%|█▉        | 139/725 [4:43:41<16:55:15, 103.95s/it]

([0, 0, 1], [0, 0, 1])
tensor(6.1999, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 1.3072287401882932e-05), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(6.1999, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 3.1032854167278856e-05), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(6.1999, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 3.256499985582195e-05), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(6.1999, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 3.256500349380076e-05), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(6.1999, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 3.256529453210533e-05), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(6.1999, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 3.221751103410497e-05), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(6.1999, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 4.563911

 19%|█▉        | 140/725 [4:48:52<26:58:40, 166.02s/it]

test：1.0, test mean: 0.30612244897959184
([2, 8, 2], [2, 8, 2])
tensor(20.9324, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -13.085071563720703), (499999, 0.0)]
([2, 8, 2], [2, 8, 2])
tensor(21.2754, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -45.29656982421875), (499999, 0.0)]
([2, 8, 2], [2, 8, 2])
tensor(21.2699, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -78.81725311279297), (499999, 0.0)]
([2, 8, 2], [2, 8, 2])
tensor(20.9956, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -115.94689178466797), (499999, 0.0)]
([2, 8, 2], [2, 8, 2])
tensor(20.5325, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -141.92738342285156), (499999, 0.0)]
([2, 8, 2], [2, 8, 2])
tensor(19.3438, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -117.23828125), (499999, 0.0)]
([2, 0, 2], [2, 8, 2])
tensor(18.8914, device='cuda:0', grad_fn=<NllLossBackward0>)
[(49999

 19%|█▉        | 141/725 [4:53:59<33:50:12, 208.58s/it]

test：0.0, test mean: 0.3
([8, 2, 2], [8, 2, 2])
tensor(29.0541, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 20%|█▉        | 142/725 [4:54:31<25:11:34, 155.56s/it]

([0, 2, 2], [0, 2, 2])
tensor(23.4414, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -1.230888574355049e-05), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(23.4414, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.003090463113039732), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(23.4414, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.0030816737562417984), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(23.4414, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.003632808569818735), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(23.4414, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.0036321906372904778), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(23.4414, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.00465855747461319), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [0, 2, 2])
tensor(23.4414, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.00465855747461319),

 20%|█▉        | 143/725 [4:59:27<31:56:33, 197.58s/it]

test：1.0, test mean: 0.3137254901960784
([1, 2, 5], [1, 2, 5])
tensor(18.5977, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.023188995197415352), (499999, 0.0), (499999, 0.0)]


 20%|█▉        | 144/725 [4:59:54<23:38:28, 146.49s/it]

([2, 1, 5], [2, 1, 5])
tensor(21.1789, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 20%|██        | 145/725 [5:00:22<17:51:41, 110.87s/it]

([7, 2, 2], [7, 2, 2])
tensor(27.5674, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.888364791870117), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(24.8046, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 22.67904281616211), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(22.7818, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 11.04331111907959), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(20.1074, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 11.042028427124023), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(20.1074, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 11.040849685668945), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(20.1074, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 11.03966999053955), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(20.1074, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 11.038494110107422), (499999, 0.0), (499

 20%|██        | 146/725 [5:05:18<26:45:35, 166.38s/it]

test：0.0, test mean: 0.3076923076923077
([2, 0, 5], [2, 0, 5])
tensor(11.7968, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 20%|██        | 147/725 [5:05:48<20:08:19, 125.43s/it]

([8, 1, 6], [8, 1, 6])
tensor(19.1541, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 20%|██        | 148/725 [5:06:16<15:24:52, 96.17s/it] 

([0, 2, 2], [7, 2, 2])
tensor(17.9979, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.012104276567697525), (499999, 0.0)]


 21%|██        | 149/725 [5:06:45<12:11:36, 76.21s/it]

([5, 1, 7], [8, 1, 7])
tensor(10.6596, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (331775, -7.355527877807617), (380927, 2.9423112869262695)]
([5, 1, 7], [8, 1, 7])
tensor(9.0561, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (331775, -15.68021297454834), (380927, 14.525495529174805)]
([5, 1, 7], [8, 1, 7])
tensor(7.1638, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (331775, -24.004901885986328), (380927, -0.23819780349731445)]
([5, 1, 0], [8, 1, 7])
tensor(3.1335, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (331775, -32.329586029052734), (380927, -23.621841430664062)]
([5, 1, 0], [8, 1, 7])
tensor(2.9052, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (331775, -40.654273986816406), (380927, -24.053955078125)]
([5, 1, 0], [8, 1, 7])
tensor(2.9052, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (331775, -48.97895812988281), (380927, -24.486072540283203)]
([5, 1, 0], [8, 1, 7])
tensor(2.9052, de

 21%|██        | 150/725 [5:12:12<24:11:26, 151.45s/it]

test：0.5, test mean: 0.3113207547169811
([8, 8, 7], [8, 8, 7])
tensor(14.2491, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 21%|██        | 151/725 [5:12:41<18:15:49, 114.55s/it]

([2, 2, 7], [2, 2, 7])
tensor(30.8936, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.9181621074676514)]
([2, 2, 7], [2, 2, 7])
tensor(27.6029, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 18.755779266357422)]
([2, 2, 7], [2, 2, 7])
tensor(27.0323, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 22.666099548339844)]
([2, 2, 0], [2, 2, 7])
tensor(22.2199, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 26.96521759033203)]
([2, 2, 0], [2, 2, 7])
tensor(22.1383, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 26.95024299621582)]
([2, 2, 0], [2, 2, 7])
tensor(22.1383, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 26.935270309448242)]
([2, 2, 0], [2, 2, 7])
tensor(22.1383, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 26.92029

 21%|██        | 152/725 [5:17:55<27:46:56, 174.55s/it]

test：0.0, test mean: 0.3055555555555556
([5, 2, 5], [5, 2, 5])
tensor(17.0603, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 21%|██        | 153/725 [5:18:24<20:47:30, 130.86s/it]

([0, 0, 7], [0, 0, 7])
tensor(3.4072, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 21%|██        | 154/725 [5:18:53<15:53:39, 100.21s/it]

([0, 0, 2], [0, 7, 2])
tensor(12.4384, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 21%|██▏       | 155/725 [5:19:20<12:23:43, 78.29s/it] 

([1, 2, 3], [1, 2, 1])
tensor(15.5933, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 22%|██▏       | 156/725 [5:19:47<9:57:41, 63.03s/it] 

([8, 8, 5], [8, 8, 5])
tensor(11.2148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -2.9422621726989746)]
([8, 8, 5], [8, 8, 5])
tensor(11.0221, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -20.10017204284668)]
([8, 8, 5], [8, 8, 5])
tensor(8.9219, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -39.96727752685547)]
([8, 8, 3], [8, 8, 5])
tensor(8.8159, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -47.52914047241211)]
([8, 8, 0], [8, 8, 5])
tensor(8.5031, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -47.53117370605469)]
([8, 8, 0], [8, 8, 5])
tensor(8.5030, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -47.53295135498047)]
([8, 8, 0], [8, 8, 5])
tensor(8.5030, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -47.534095

 22%|██▏       | 157/725 [5:25:13<22:22:50, 141.85s/it]

test：0.0, test mean: 0.3
([8, 0, 1], [8, 1, 1])
tensor(6.8233, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 22%|██▏       | 158/725 [5:25:43<17:03:30, 108.31s/it]

([0, 3, 1], [0, 3, 1])
tensor(11.5651, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.477436851222592e-07), (245759, -17.191835403442383), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(11.1209, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.3552114726044238e-05), (245759, -30.91732406616211), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(8.9125, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.00011822403757832944), (245759, -27.532833099365234), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(7.5735, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.0001210189366247505), (245759, -42.94406509399414), (499999, 0.0)]
([0, 0, 1], [0, 3, 1])
tensor(7.5024, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.0001210286354762502), (245759, -44.540889739990234), (499999, 0.0)]
([0, 0, 1], [0, 3, 1])
tensor(7.3058, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.00012276163033675402), (245759, -44.575416564941406), (499999, 0.0)]
([0, 0, 1], [0

 22%|██▏       | 159/725 [5:30:39<25:52:42, 164.60s/it]

test：0.5, test mean: 0.30357142857142855
([0, 1, 1], [0, 1, 1])
tensor(12.4038, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00018274695321451873), (499999, 0.0), (499999, 0.0)]


 22%|██▏       | 160/725 [5:31:07<19:23:00, 123.51s/it]

([1, 5, 1], [1, 5, 1])
tensor(15.5006, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 22%|██▏       | 161/725 [5:31:34<14:49:26, 94.62s/it] 

([8, 8, 5], [8, 8, 5])
tensor(10.0086, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -13.106046676635742), (499999, 0.05951552093029022)]
([8, 8, 5], [8, 8, 5])
tensor(10.4444, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -43.68770217895508), (499999, 0.11903104186058044)]
([8, 8, 5], [8, 8, 5])
tensor(10.4334, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -75.58883666992188), (499999, 0.17854659259319305)]
([8, 8, 5], [8, 8, 5])
tensor(10.1569, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -107.7745361328125), (499999, 0.2380620837211609)]
([8, 8, 5], [8, 8, 5])
tensor(10.1139, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -140.00503540039062), (499999, 0.2975776195526123)]
([8, 8, 5], [8, 8, 5])
tensor(10.0272, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -168.03900146484375), (499999, 0.3570931553840637)]
([8, 8, 5], [8, 8, 5])
tensor(9.63

 22%|██▏       | 162/725 [5:36:33<24:23:03, 155.92s/it]

test：1.0, test mean: 0.3157894736842105
([2, 1, 1], [2, 1, 1])
tensor(23.0056, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 22%|██▏       | 163/725 [5:37:01<18:20:59, 117.54s/it]

([5, 2, 1], [5, 2, 1])
tensor(17.0208, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.004791960120201111), (499999, 0.0), (499999, 0.0)]


 23%|██▎       | 164/725 [5:37:26<13:59:23, 89.77s/it] 

([2, 5, 8], [2, 5, 8])
tensor(10.0967, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 23%|██▎       | 165/725 [5:37:51<10:56:57, 70.39s/it]

([2, 1, 1], [2, 1, 1])
tensor(24.7125, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 23%|██▎       | 166/725 [5:38:17<8:51:00, 57.00s/it] 

([7, 5, 1], [7, 5, 1])
tensor(17.5220, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9423112869262695), (499999, 0.0), (499999, 0.0)]
([7, 5, 1], [7, 5, 1])
tensor(15.6329, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 11.366283416748047), (499999, 0.0), (499999, 0.0)]
([7, 5, 1], [7, 5, 1])
tensor(13.8923, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.562100410461426), (499999, 0.0), (499999, 0.0)]
([7, 5, 1], [7, 5, 1])
tensor(9.7893, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -40.93817901611328), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 5, 1])
tensor(9.3451, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -41.56340789794922), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 5, 1])
tensor(9.3449, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -42.161102294921875), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 5, 1])
tensor(9.3447, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -42.704219818115234), (499999, 0.0), (4

 23%|██▎       | 167/725 [5:43:02<19:25:04, 125.28s/it]

test：0.0, test mean: 0.3103448275862069
([1, 8, 0], [1, 8, 0])
tensor(13.4589, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.10915221273899078), (499999, 0.0)]


 23%|██▎       | 168/725 [5:43:25<14:40:50, 94.88s/it] 

([6, 0, 0], [6, 0, 0])
tensor(0.7055, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -9.977334514132963e-08), (499999, -8.833850733935833e-05)]
([6, 0, 0], [6, 0, 0])
tensor(0.7055, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -1.9954669028265926e-07), (499999, -0.00017667701467871666)]
([6, 0, 0], [6, 0, 0])
tensor(0.7055, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -2.993200496348436e-07), (499999, -0.0002650155802257359)]
([6, 0, 0], [6, 0, 0])
tensor(0.7055, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -3.990933805653185e-07), (499999, -0.0003533540293574333)]
([6, 0, 0], [6, 0, 0])
tensor(0.7055, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -4.988667114957934e-07), (499999, -0.00044169259490445256)]
([6, 0, 0], [6, 0, 0])
tensor(0.7055, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (417791, -5.986400992696872e-07), (499999, -0.00053003081120550

 23%|██▎       | 169/725 [5:48:12<23:30:47, 152.24s/it]

test：1.0, test mean: 0.3220338983050847
([2, 0, 1], [2, 0, 1])
tensor(15.1761, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, -8.224905014038086)]
([2, 0, 1], [2, 0, 1])
tensor(15.0757, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, -15.773492813110352)]
([2, 0, 1], [2, 0, 1])
tensor(14.4282, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, -50.0513916015625)]
([2, 0, 1], [2, 0, 1])
tensor(13.6392, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, -83.0165023803711)]
([2, 0, 1], [2, 0, 1])
tensor(13.3496, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, -118.76154327392578)]
([2, 0, 1], [2, 0, 1])
tensor(13.0603, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (147455, -154.44801330566406)]
([2, 0, 1], [2, 0, 1])
tensor(12.4362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499

 23%|██▎       | 170/725 [5:54:17<33:19:15, 216.14s/it]

test：0.0, test mean: 0.31666666666666665
([1, 0, 1], [1, 0, 1])
tensor(8.5168, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 24%|██▎       | 171/725 [5:54:43<24:28:43, 159.07s/it]

([1, 8, 8], [1, 8, 8])
tensor(14.8103, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 24%|██▎       | 172/725 [5:55:08<18:17:12, 119.05s/it]

([2, 1, 8], [2, 1, 8])
tensor(25.8042, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 24%|██▍       | 173/725 [5:55:34<13:56:52, 90.96s/it] 

([0, 6, 1], [0, 6, 1])
tensor(8.1806, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 24%|██▍       | 174/725 [5:56:00<10:55:59, 71.43s/it]

([1, 2, 1], [1, 2, 1])
tensor(14.0775, device='cuda:0', grad_fn=<NllLossBackward0>)


 24%|██▍       | 175/725 [5:56:27<8:52:38, 58.11s/it] 

[(499999, 0.0), (499999, 0.0), (499999, 0.0)]
([7, 8, 6], [7, 7, 6])
tensor(10.7536, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9369421005249023), (499999, 0.0), (499999, 0.0)]
([7, 8, 6], [7, 7, 6])
tensor(7.4012, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 18.34682846069336), (499999, 0.0), (499999, 0.0)]
([0, 8, 6], [7, 7, 6])
tensor(4.4167, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.531918525695801), (499999, 0.0), (499999, 0.0)]
([0, 8, 6], [7, 7, 6])
tensor(4.2472, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.5317702293395996), (499999, 0.0), (499999, 0.0)]
([0, 8, 6], [7, 7, 6])
tensor(4.2472, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.5316219329833984), (499999, 0.0), (499999, 0.0)]
([0, 8, 6], [7, 7, 6])
tensor(4.2472, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.5314745903015137), (499999, 0.0), (499999, 0.0)]
([0, 8, 6], [7, 7, 6])
tensor(4.2472, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927

 24%|██▍       | 176/725 [6:01:21<19:40:16, 128.99s/it]

test：0.0, test mean: 0.3114754098360656
([2, 1, 1], [2, 1, 1])
tensor(18.6666, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 24%|██▍       | 177/725 [6:01:50<15:03:51, 98.96s/it] 

([6, 0, 0], [6, 0, 0])
tensor(3.8645, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 25%|██▍       | 178/725 [6:02:18<11:49:13, 77.80s/it]

([2, 8, 8], [2, 8, 5])
tensor(13.4864, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 25%|██▍       | 179/725 [6:02:47<9:33:22, 63.01s/it] 

([2, 2, 6], [2, 2, 6])
tensor(16.2886, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 25%|██▍       | 180/725 [6:03:17<8:01:35, 53.02s/it]

([0, 5, 7], [0, 5, 7])
tensor(8.2350, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.9181621074676514)]
([0, 5, 7], [0, 5, 7])
tensor(6.3913, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 17.64664077758789)]
([0, 5, 7], [0, 5, 7])
tensor(4.6497, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 15.809601783752441)]
([0, 5, 0], [0, 5, 7])
tensor(2.3365, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 15.810975074768066)]
([0, 5, 0], [0, 5, 7])
tensor(2.3365, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 15.812347412109375)]
([0, 5, 0], [0, 5, 7])
tensor(2.3365, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 15.813716888427734)]
([0, 5, 0], [0, 5, 7])
tensor(2.3365, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 15.81509208679

 25%|██▍       | 181/725 [6:08:53<20:50:15, 137.90s/it]

test：0.0, test mean: 0.3064516129032258
([6, 2, 8], [6, 2, 8])
tensor(20.6337, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 25%|██▌       | 182/725 [6:09:22<15:52:17, 105.22s/it]

([3, 8, 5], [3, 8, 5])
tensor(15.5333, device='cuda:0', grad_fn=<NllLossBackward0>)
[(196607, -17.86614990234375), (499999, 0.0), (479231, -2.9431138038635254)]
([7, 8, 5], [3, 8, 5])
tensor(12.6374, device='cuda:0', grad_fn=<NllLossBackward0>)
[(196607, -44.191612243652344), (499999, 0.0), (479231, -37.175376892089844)]
([0, 8, 0], [3, 8, 5])
tensor(5.8385, device='cuda:0', grad_fn=<NllLossBackward0>)
[(196607, -44.19075012207031), (499999, 0.0), (479231, -42.24714660644531)]
([0, 8, 0], [3, 8, 5])
tensor(5.7089, device='cuda:0', grad_fn=<NllLossBackward0>)
[(196607, -44.19034194946289), (499999, 0.0), (479231, -42.250244140625)]
([0, 8, 0], [3, 8, 5])
tensor(5.7088, device='cuda:0', grad_fn=<NllLossBackward0>)
[(196607, -44.18996810913086), (499999, 0.0), (479231, -42.25262451171875)]
([0, 8, 0], [3, 8, 5])
tensor(5.7088, device='cuda:0', grad_fn=<NllLossBackward0>)
[(196607, -44.18978500366211), (499999, 0.0), (479231, -42.2545051574707)]
([0, 8, 0], [3, 8, 5])
tensor(5.7087, device

 25%|██▌       | 183/725 [6:14:16<24:22:06, 161.86s/it]

test：0.0, test mean: 0.30158730158730157
([7, 3, 7], [7, 3, 7])
tensor(18.5689, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (405503, -17.191875457763672), (380927, 2.7659730911254883)]
([7, 3, 7], [7, 3, 7])
tensor(14.5456, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 20.601123809814453), (405503, -30.550445556640625), (380927, 14.259552955627441)]
([7, 3, 7], [7, 3, 7])
tensor(7.9939, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 13.675745010375977), (405503, -62.81367111206055), (380927, 120.1441879272461)]
([0, 3, 7], [7, 3, 7])
tensor(2.9751, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 13.669595718383789), (405503, -99.27552032470703), (380927, 120.04300689697266)]
([0, 3, 0], [7, 3, 7])
tensor(1.5177, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 13.663448333740234), (405503, -109.00079345703125), (380927, 120.27217102050781)]
([0, 0, 0], [7, 3, 7])
tensor(0.2126, device='cuda:0', grad_fn=<NllLossBackward0>)
[(

 25%|██▌       | 184/725 [6:19:28<31:07:54, 207.16s/it]

test：0.0, test mean: 0.296875
([7, 7, 2], [7, 7, 2])
tensor(23.8568, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.888364791870117), (499999, 0.0), (499999, -0.011318929493427277)]
([7, 7, 2], [7, 7, 2])
tensor(22.7545, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 25.09968376159668), (499999, 0.0), (499999, -0.022637858986854553)]
([7, 7, 2], [7, 7, 2])
tensor(19.3189, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 23.71428680419922), (499999, 0.0), (499999, -0.03395676240324974)]
([0, 7, 2], [7, 7, 2])
tensor(17.8644, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 23.714448928833008), (499999, 0.0), (499999, -0.045275717973709106)]
([0, 7, 2], [7, 7, 2])
tensor(17.8644, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 23.714609146118164), (499999, 0.0), (499999, -0.056594640016555786)]
([0, 7, 2], [7, 7, 2])
tensor(17.8644, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 23.714771270751953), (499999, 0.0), (499999, -0.06791353225708008

 26%|██▌       | 185/725 [6:24:26<35:08:16, 234.25s/it]

test：0.0, test mean: 0.2923076923076923
([1, 3, 2], [1, 3, 2])
tensor(24.1780, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -17.97531509399414), (499999, 0.0)]
([1, 3, 2], [1, 3, 2])
tensor(23.5103, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -34.26577377319336), (499999, 0.0)]
([1, 3, 2], [1, 3, 2])
tensor(22.1328, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -46.634769439697266), (499999, 0.0)]
([1, 3, 2], [1, 3, 2])
tensor(21.1366, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -54.992820739746094), (499999, 0.0)]
([1, 0, 2], [1, 3, 2])
tensor(20.7710, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -55.990806579589844), (499999, 0.0)]
([1, 0, 2], [1, 3, 2])
tensor(20.6651, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -51.619022369384766), (499999, 0.0)]
([1, 0, 2], [1, 3, 2])
tensor(20.5651, device='cuda:0', grad_fn=<NllLossBackward0>)
[(

 26%|██▌       | 186/725 [6:29:23<37:53:15, 253.05s/it]

test：0.0, test mean: 0.2878787878787879
([1, 7, 0], [1, 7, 0])
tensor(15.2872, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9369421005249023), (499999, 0.0)]
([1, 7, 0], [1, 7, 0])
tensor(11.8987, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 18.361038208007812), (499999, 0.0)]
([1, 0, 0], [1, 7, 0])
tensor(8.9110, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.36429405212402344), (499999, 0.0)]
([1, 0, 0], [1, 7, 0])
tensor(8.7106, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.36423397064208984), (499999, 0.0)]
([1, 0, 0], [1, 7, 0])
tensor(8.7106, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.36417293548583984), (499999, 0.0)]
([1, 0, 0], [1, 7, 0])
tensor(8.7106, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.36411380767822266), (499999, 0.0)]
([1, 0, 0], [1, 7, 0])
tensor(8.7106, device='cuda:0', grad_fn=<NllLossBackward0>)
[(49999

 26%|██▌       | 187/725 [6:34:12<39:25:17, 263.79s/it]

test：0.0, test mean: 0.2835820895522388
([0, 2, 1], [0, 2, 1])
tensor(15.1059, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 26%|██▌       | 188/725 [6:34:39<28:45:07, 192.75s/it]

([1, 2, 1], [1, 2, 1])
tensor(30.3288, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 26%|██▌       | 189/725 [6:35:05<21:15:54, 142.83s/it]

([2, 1, 1], [2, 1, 1])
tensor(25.0220, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 26%|██▌       | 190/725 [6:35:32<16:03:59, 108.11s/it]

([0, 0, 6], [0, 0, 6])
tensor(3.9948, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 26%|██▋       | 191/725 [6:35:58<12:23:38, 83.56s/it] 

([2, 0, 1], [2, 0, 1])
tensor(15.6524, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 26%|██▋       | 192/725 [6:36:24<9:48:56, 66.30s/it] 

([2, 6, 2], [2, 6, 2])
tensor(22.9591, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 27%|██▋       | 193/725 [6:36:50<8:01:02, 54.25s/it]

([7, 3, 2], [7, 7, 2])
tensor(11.5251, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 27%|██▋       | 194/725 [6:37:17<6:46:09, 45.89s/it]

([0, 1, 0], [0, 1, 7])
tensor(8.5213, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 27%|██▋       | 195/725 [6:37:45<5:58:48, 40.62s/it]

([7, 7, 1], [7, 7, 1])
tensor(22.5457, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.7659730911254883), (368639, 2.8883633613586426), (499999, 0.0)]
([7, 7, 1], [7, 7, 1])
tensor(18.1273, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 8.364208221435547), (368639, 8.384690284729004), (499999, 0.0)]
([7, 7, 1], [7, 7, 1])
tensor(11.0965, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 34.262107849121094), (368639, 4.4343366622924805), (499999, 0.0)]
([0, 0, 1], [7, 7, 1])
tensor(7.8565, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 34.19803237915039), (368639, 4.434338092803955), (499999, 0.0)]
([0, 0, 1], [7, 7, 1])
tensor(7.8565, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 34.13396072387695), (368639, 4.4343390464782715), (499999, 0.0)]
([0, 0, 1], [7, 7, 1])
tensor(7.8565, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 34.07188415527344), (368639, 4.434342384338379), (499999, 0.0)]
([0, 0, 1], [7, 7, 1])
tensor(7.8565, device='cuda

 27%|██▋       | 196/725 [6:42:32<16:48:17, 114.36s/it]

test：0.0, test mean: 0.27941176470588236
([2, 2, 5], [2, 2, 5])
tensor(27.9896, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 27%|██▋       | 197/725 [6:42:58<12:54:03, 87.96s/it] 

([0, 5, 0], [7, 5, 0])
tensor(4.6171, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (356351, 2.993383674265715e-08)]
([0, 5, 0], [7, 5, 0])
tensor(4.6171, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (356351, -1.0611040579533437e-06)]
([0, 5, 0], [7, 5, 0])
tensor(4.6171, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (356351, -1.0323699370928807e-06)]
([0, 5, 0], [7, 5, 0])
tensor(4.6171, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (356351, 0.00014961071428842843)]
([0, 5, 0], [7, 5, 0])
tensor(4.6171, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (356351, 0.0001496437907917425)]
([0, 5, 0], [7, 5, 0])
tensor(4.6171, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (356351, 0.00014967689639888704)]
([0, 5, 0], [7, 5, 0])
tensor(4.6171, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0)

 27%|██▋       | 198/725 [6:48:14<22:52:43, 156.29s/it]

test：1.0, test mean: 0.2898550724637681
([1, 2, 0], [1, 2, 0])
tensor(20.1031, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.004360351245850325)]


 27%|██▋       | 199/725 [6:48:40<17:08:12, 117.29s/it]

([2, 2, 5], [2, 2, 5])
tensor(24.7152, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.01973973959684372), (454655, -2.9430694580078125)]
([2, 2, 5], [2, 2, 5])
tensor(24.5145, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.03947947919368744), (454655, -19.289798736572266)]
([2, 2, 5], [2, 2, 5])
tensor(21.9313, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.05921921879053116), (454655, -48.47603988647461)]
([2, 2, 1], [2, 2, 5])
tensor(20.4866, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.07895895838737488), (454655, -65.57984924316406)]
([2, 2, 5], [2, 2, 5])
tensor(22.4368, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.098698690533638), (454655, -96.67008972167969)]
([2, 2, 1], [2, 2, 5])
tensor(21.6101, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.11843843758106232), (454655, -142.98583984375)]
([2, 2, 0], [2, 2, 5])
tensor(

 28%|██▊       | 200/725 [6:53:41<25:08:05, 172.35s/it]

test：0.0, test mean: 0.2857142857142857
([0, 0, 1], [0, 0, 1])
tensor(6.1429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, 2.3994084585865494e-06), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(6.1429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, 7.4656350079749245e-06), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(6.1429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, 6.033171757735545e-06), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(6.1429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, 6.033150384610053e-06), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(6.1429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, 6.0171596487634815e-06), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(6.1429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, 6.017166469973745e-06), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(6.1429, device='cuda:0', grad_fn=<NllLossBack

 28%|██▊       | 201/725 [6:58:03<29:01:24, 199.40s/it]

test：1.0, test mean: 0.29577464788732394
([8, 1, 2], [8, 1, 2])
tensor(17.6903, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 28%|██▊       | 202/725 [6:58:27<21:17:53, 146.60s/it]

([1, 7, 7], [1, 7, 7])
tensor(22.8285, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (380927, 2.712027072906494)]
([1, 7, 7], [1, 7, 7])
tensor(18.1240, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 12.905088424682617), (380927, 29.367753982543945)]
([1, 7, 0], [1, 7, 7])
tensor(12.1405, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.9865031242370605), (380927, 29.396488189697266)]
([1, 0, 0], [1, 7, 7])
tensor(7.8082, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.97460412979126), (380927, 29.421035766601562)]
([1, 0, 0], [1, 7, 7])
tensor(7.8078, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.967017650604248), (380927, 29.445579528808594)]
([1, 0, 0], [1, 7, 7])
tensor(7.8072, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.967531204223633), (380927, 29.4573974609375)]
([1, 0, 0], [1, 7, 7])
tensor(7.8072, device='cuda

 28%|██▊       | 203/725 [7:03:11<27:14:25, 187.87s/it]

test：0.0, test mean: 0.2916666666666667
([0, 1, 2], [0, 1, 2])
tensor(16.1995, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 28%|██▊       | 204/725 [7:03:35<20:05:58, 138.88s/it]

([2, 1, 2], [2, 1, 2])
tensor(18.3502, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 28%|██▊       | 205/725 [7:03:58<15:01:12, 103.98s/it]

([2, 1, 5], [2, 1, 1])
tensor(20.8298, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 28%|██▊       | 206/725 [7:04:21<11:30:26, 79.82s/it] 

([1, 0, 1], [1, 0, 1])
tensor(10.8097, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -8.040768571804247e-13), (499999, 0.0)]


 29%|██▊       | 207/725 [7:04:45<9:04:37, 63.08s/it] 

([5, 8, 0], [5, 8, 0])
tensor(9.0495, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.3114342841145117e-06)]
([5, 8, 0], [5, 8, 0])
tensor(9.0495, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.5102621041005477e-06)]
([5, 8, 0], [5, 8, 0])
tensor(9.0495, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.510138640194782e-06)]
([5, 8, 0], [5, 8, 0])
tensor(9.0495, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.5100156310363673e-06)]
([5, 8, 0], [5, 8, 0])
tensor(9.0495, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.5098935313726543e-06)]
([5, 8, 0], [5, 8, 0])
tensor(9.0495, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.509771886456292e-06)]
([5, 8, 0], [5, 8, 0])
tensor(9.0495, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), 

 29%|██▊       | 208/725 [7:09:31<18:37:54, 129.74s/it]

test：1.0, test mean: 0.3013698630136986
([1, 3, 2], [1, 3, 2])
tensor(20.5539, device='cuda:0', grad_fn=<NllLossBackward0>)


 29%|██▉       | 209/725 [7:09:54<14:00:19, 97.71s/it] 

[(499999, 0.0), (499999, 0.0), (499999, -0.009891888126730919)]
([1, 5, 1], [1, 5, 1])
tensor(16.2037, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.004490381106734276)]


 29%|██▉       | 210/725 [7:10:18<10:50:32, 75.79s/it]

([7, 7, 5], [7, 7, 5])
tensor(12.5401, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181406497955322), (368639, 2.9181385040283203), (499999, 0.07260040938854218)]
([7, 7, 5], [7, 7, 5])
tensor(6.1689, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 59.15118408203125), (368639, 54.51333999633789), (499999, 0.14520081877708435)]
([7, 7, 5], [7, 7, 5])
tensor(3.9986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 79.09294128417969), (368639, 56.21613693237305), (499999, 0.21780119836330414)]
([0, 0, 5], [7, 7, 5])
tensor(2.2879, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 82.52395629882812), (368639, 56.21649169921875), (499999, 0.2904016375541687)]
([0, 0, 5], [7, 7, 5])
tensor(2.2596, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 82.52412414550781), (368639, 56.21684265136719), (499999, 0.36300212144851685)]
([0, 0, 5], [7, 7, 5])
tensor(2.2596, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 82.52429962158203), (368639, 56.21719741

 29%|██▉       | 211/725 [7:14:36<18:35:37, 130.23s/it]

test：0.0, test mean: 0.2972972972972973
([2, 8, 7], [2, 8, 7])
tensor(16.2436, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -13.141175270080566), (499999, 0.0)]
([2, 8, 7], [2, 8, 7])
tensor(16.7298, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -43.30940246582031), (499999, 0.0)]
([2, 8, 7], [2, 8, 7])
tensor(16.7202, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -74.80691528320312), (499999, 0.0)]
([2, 8, 7], [2, 8, 7])
tensor(16.5223, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -107.99346923828125), (499999, 0.0)]
([2, 8, 7], [2, 8, 7])
tensor(16.0344, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -141.54022216796875), (499999, 0.0)]
([2, 8, 7], [2, 8, 7])
tensor(15.4782, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -173.42538452148438), (499999, 0.0)]
([2, 8, 7], [2, 8, 7])
tensor(14.8501, device='cuda:0', grad_fn=<NllLossBackward0>)
[(

 29%|██▉       | 212/725 [7:18:59<24:14:44, 170.14s/it]

test：0.0, test mean: 0.29333333333333333
([0, 2, 1], [0, 2, 1])
tensor(14.4712, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 29%|██▉       | 213/725 [7:19:24<17:59:33, 126.51s/it]

([1, 3, 2], [1, 3, 2])
tensor(20.2045, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 30%|██▉       | 214/725 [7:19:49<13:38:28, 96.10s/it] 

([1, 5, 5], [1, 5, 5])
tensor(12.9747, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 30%|██▉       | 215/725 [7:20:12<10:31:50, 74.33s/it]

([0, 7, 2], [0, 7, 2])
tensor(16.7567, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.005301639437675476), (380927, 2.9181621074676514), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(13.2401, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.012608939781785011), (380927, 19.127487182617188), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(11.5966, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.41635602712631226), (380927, 0.25707149505615234), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(7.6878, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.41488659381866455), (380927, 9.522074699401855), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(7.4976, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.478307843208313), (380927, 9.517751693725586), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(7.4969, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, 0.4784483015537262), (380927, 9.513425827026367), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(7.4969, de

 30%|██▉       | 216/725 [7:24:35<18:30:25, 130.89s/it]

test：0.5, test mean: 0.29605263157894735
([7, 0, 1], [7, 0, 1])
tensor(14.8137, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (499999, 0.0)]
([7, 0, 1], [7, 0, 1])
tensor(12.7523, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 26.603300094604492), (499999, 0.0), (499999, 0.0)]
([7, 0, 1], [7, 0, 1])
tensor(11.9726, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 28.868349075317383), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(8.3216, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 13.966980934143066), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(8.2918, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 62.29793930053711), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(8.1283, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 62.1373291015625), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(8.1133, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 62

 30%|██▉       | 217/725 [7:28:55<23:56:28, 169.66s/it]

test：0.0, test mean: 0.2922077922077922
([0, 7, 1], [0, 7, 1])
tensor(6.3163, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9369421005249023), (499999, 0.0)]
([0, 7, 1], [0, 7, 1])
tensor(4.4533, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 19.52411460876465), (499999, 0.0)]
([0, 7, 1], [0, 7, 1])
tensor(2.5198, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 10.535852432250977), (499999, 0.0)]
([0, 0, 1], [0, 7, 1])
tensor(0.3488, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 10.136140823364258), (499999, 0.0)]
([0, 0, 1], [0, 7, 1])
tensor(0.3486, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 9.790958404541016), (499999, 0.0)]
([0, 0, 1], [0, 7, 1])
tensor(0.3485, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 9.457329750061035), (499999, 0.0)]
([0, 0, 1], [0, 7, 1])
tensor(0.3483, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), 

 30%|███       | 218/725 [7:33:18<27:50:47, 197.73s/it]

test：0.0, test mean: 0.28846153846153844
([0, 0, 1], [0, 0, 1])
tensor(4.5000, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, 6.188610768731451e-06), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(4.5000, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -4.24510108132381e-06), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(4.5000, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -4.245126092428109e-06), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(4.5000, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -4.245149284543004e-06), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(4.5000, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -4.245173386152601e-06), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(4.5000, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (442367, -4.245197942509549e-06), (499999, 0.0)]
([0, 0, 1], [0, 0, 1])
tensor(4.5000, device='cuda:0', grad_fn=<NllLossBa

 30%|███       | 219/725 [7:37:41<30:32:03, 217.24s/it]

test：1.0, test mean: 0.2974683544303797
([2, 1, 2], [2, 1, 2])
tensor(28.4556, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 30%|███       | 220/725 [7:38:05<22:20:02, 159.21s/it]

([7, 1, 1], [7, 1, 1])
tensor(14.1138, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 30%|███       | 221/725 [7:38:29<16:37:05, 118.70s/it]

([2, 6, 2], [2, 6, 2])
tensor(19.7689, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 31%|███       | 222/725 [7:38:54<12:38:50, 90.52s/it] 

([0, 0, 7], [0, 7, 7])
tensor(8.2507, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.9181621074676514)]
([0, 0, 7], [0, 7, 7])
tensor(5.3241, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 21.02709197998047)]
([0, 0, 7], [0, 7, 7])
tensor(4.3131, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 17.945573806762695)]
([0, 0, 7], [0, 7, 7])
tensor(0.8609, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -21.970016479492188)]
([0, 0, 0], [0, 7, 7])
tensor(0.0244, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -23.027870178222656)]
([0, 0, 0], [0, 7, 7])
tensor(0.0234, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -23.853609085083008)]
([0, 0, 0], [0, 7, 7])
tensor(0.0234, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -24.6793403

 31%|███       | 223/725 [7:43:31<20:24:52, 146.40s/it]

test：0.0, test mean: 0.29375
([2, 1, 8], [2, 1, 8])
tensor(15.6176, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 31%|███       | 224/725 [7:43:55<15:17:22, 109.87s/it]

([2, 8, 7], [2, 8, 7])
tensor(15.7553, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (459263, 2.6455941200256348)]
([2, 8, 0], [2, 8, 7])
tensor(13.8071, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (459263, 3.6026453971862793)]
([2, 8, 3], [2, 8, 7])
tensor(18.1701, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (459263, 253.24948120117188)]
([2, 8, 7], [2, 8, 7])
tensor(14.6474, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (459263, 274.29742431640625)]
([2, 8, 0], [2, 8, 7])
tensor(13.8259, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (459263, 275.94384765625)]
([2, 8, 0], [2, 8, 7])
tensor(13.8125, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (459263, 277.25140380859375)]
([2, 8, 0], [2, 8, 7])
tensor(13.8013, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (459263, 277.58319

 31%|███       | 225/725 [7:48:26<21:56:23, 157.97s/it]

test：0.0, test mean: 0.29012345679012347
([2, 2, 5], [2, 2, 7])
tensor(23.1107, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 31%|███       | 226/725 [7:48:51<16:22:34, 118.15s/it]

([2, 1, 8], [2, 1, 8])
tensor(22.0747, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 31%|███▏      | 227/725 [7:49:16<12:28:48, 90.22s/it] 

([1, 8, 2], [1, 8, 2])
tensor(22.8089, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 31%|███▏      | 228/725 [7:49:40<9:43:41, 70.47s/it] 

([1, 8, 1], [1, 8, 1])
tensor(18.5560, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 32%|███▏      | 229/725 [7:50:06<7:52:21, 57.14s/it]

([2, 2, 1], [2, 2, 1])
tensor(29.8091, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 32%|███▏      | 230/725 [7:50:31<6:29:48, 47.25s/it]

([2, 0, 8], [2, 0, 8])
tensor(13.6467, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 32%|███▏      | 231/725 [7:50:55<5:32:23, 40.37s/it]

([0, 6, 5], [1, 6, 5])
tensor(8.2874, device='cuda:0', grad_fn=<NllLossBackward0>)


 32%|███▏      | 232/725 [7:51:18<4:49:43, 35.26s/it]

[(499999, 0.0), (499999, 0.0), (499999, 0.0)]
([1, 0, 7], [1, 0, 7])
tensor(16.9526, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.9181621074676514)]
([1, 0, 7], [1, 0, 7])
tensor(13.7086, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 13.57709789276123)]
([1, 0, 7], [1, 0, 7])
tensor(12.6131, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -1.2215147018432617)]
([1, 0, 7], [1, 0, 7])
tensor(8.6663, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 5.644966125488281)]
([1, 0, 0], [1, 0, 7])
tensor(8.3742, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 5.641718864440918)]
([1, 0, 0], [1, 0, 7])
tensor(8.3742, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 5.638472557067871)]
([1, 0, 0], [1, 0, 7])
tensor(8.3742, device='cuda:0', grad_fn=<NllLossBackward0>)
[(49999

 32%|███▏      | 233/725 [7:55:51<14:32:29, 106.40s/it]

test：0.0, test mean: 0.2865853658536585
([0, 1, 8], [0, 1, 8])
tensor(12.4347, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -5.607581670119544e-07), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [0, 1, 8])
tensor(12.4347, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -1.1215163340239087e-06), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [0, 1, 8])
tensor(12.4347, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -1.6822746147227008e-06), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [0, 1, 8])
tensor(12.4347, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -2.2430326680478174e-06), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [0, 1, 8])
tensor(12.4347, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -2.8037909487466095e-06), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [0, 1, 8])
tensor(12.4347, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -3.3645492294454016e-06), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [0, 1, 8])
tensor(12.4347, device='cuda:0', grad_

 32%|███▏      | 234/725 [8:00:09<20:44:19, 152.06s/it]

test：1.0, test mean: 0.29518072289156627
([6, 7, 8], [6, 7, 8])
tensor(18.0683, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([6, 7, 8], [6, 7, 8])
tensor(14.9067, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 12.868511199951172), (499999, 0.0)]
([6, 7, 8], [6, 7, 8])
tensor(13.8276, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.6677255630493164), (499999, 0.0)]
([6, 7, 8], [6, 7, 8])
tensor(9.8157, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 11.03402042388916), (499999, 0.0)]
([6, 0, 8], [6, 7, 8])
tensor(9.4701, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 11.002435684204102), (499999, 0.0)]
([6, 0, 8], [6, 7, 8])
tensor(9.4701, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 10.970855712890625), (499999, 0.0)]
([6, 0, 8], [6, 7, 8])
tensor(9.4701, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 

 32%|███▏      | 235/725 [8:04:27<25:00:07, 183.69s/it]

test：0.0, test mean: 0.2916666666666667
([0, 0, 7], [0, 0, 7])
tensor(5.8867, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.5570147037506104)]
([0, 0, 7], [0, 0, 7])
tensor(5.9474, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.7581729888916016)]
([0, 0, 7], [0, 0, 7])
tensor(5.4020, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 3.2692322731018066)]
([0, 0, 7], [0, 0, 7])
tensor(3.5704, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -9.431478500366211)]
([0, 0, 7], [0, 0, 7])
tensor(3.8962, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -18.847415924072266)]
([0, 0, 3], [0, 0, 7])
tensor(2.2225, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 12.637579917907715)]
([0, 0, 7], [0, 0, 7])
tensor(1.7807, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 

 33%|███▎      | 236/725 [8:09:06<28:50:20, 212.31s/it]

test：0.0, test mean: 0.28823529411764703
([7, 1, 1], [7, 1, 1])
tensor(22.3814, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.712027072906494), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [7, 1, 1])
tensor(18.9565, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 26.294410705566406), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [7, 1, 1])
tensor(14.8043, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 10.471762657165527), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(13.5027, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 10.470354080200195), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(13.5027, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 10.46894645690918), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(13.5027, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 10.467536926269531), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(13.5027, device='cuda:0', grad_fn=<NllLossBackward0>)
[(38092

 33%|███▎      | 237/725 [8:13:27<30:46:32, 227.03s/it]

test：0.0, test mean: 0.28488372093023256
([1, 5, 5], [1, 8, 5])
tensor(13.9943, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 33%|███▎      | 238/725 [8:13:53<22:33:41, 166.78s/it]

([2, 1, 2], [2, 1, 2])
tensor(19.4786, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 33%|███▎      | 239/725 [8:14:17<16:43:18, 123.87s/it]

([0, 8, 0], [7, 8, 0])
tensor(0.5962, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.009669295512139797), (479231, -1.246716022491455)]
([0, 8, 0], [7, 8, 0])
tensor(0.4882, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.019338591024279594), (479231, -1.2389944791793823)]
([0, 8, 0], [7, 8, 0])
tensor(0.4882, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.029007887467741966), (479231, -1.2343082427978516)]
([0, 8, 0], [7, 8, 0])
tensor(0.4882, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.03867718204855919), (479231, -1.233642578125)]
([0, 8, 0], [7, 8, 0])
tensor(0.4882, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.04834648221731186), (479231, -1.2330467700958252)]
([0, 8, 0], [7, 8, 0])
tensor(0.4882, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.058015771210193634), (479231, -1.232332706451416)]
([0, 8, 0], [7, 8, 0])
tensor(

 33%|███▎      | 240/725 [8:18:46<22:32:09, 167.28s/it]

test：1.0, test mean: 0.29310344827586204
([2, 1, 0], [2, 1, 0])
tensor(14.2659, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 8.397930173487111e-07)]
([2, 1, 0], [2, 1, 0])
tensor(14.2659, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -7.20786061947365e-08)]
([2, 1, 0], [2, 1, 0])
tensor(14.2659, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -1.0297908374923281e-05)]
([2, 1, 0], [2, 1, 0])
tensor(14.2659, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -1.2582126146298833e-05)]
([2, 1, 0], [2, 1, 0])
tensor(14.2659, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 0.00012411152420099825)]
([2, 1, 0], [2, 1, 0])
tensor(14.2659, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 0.00012411152420099825)]
([2, 1, 0], [2, 1, 0])
tensor(14.2659, device='cuda:0', grad_fn=<

 33%|███▎      | 241/725 [8:23:12<26:28:58, 196.98s/it]

test：1.0, test mean: 0.30113636363636365
([2, 1, 2], [2, 1, 2])
tensor(27.3687, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 33%|███▎      | 242/725 [8:23:38<19:32:37, 145.67s/it]

([1, 5, 2], [1, 5, 2])
tensor(18.9596, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 34%|███▎      | 243/725 [8:24:03<14:39:53, 109.53s/it]

([1, 1, 1], [1, 1, 1])
tensor(20.6553, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 34%|███▎      | 244/725 [8:24:27<11:11:57, 83.82s/it] 

([8, 0, 3], [8, 0, 3])
tensor(7.5736, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -11.805475234985352), (499999, 0.0), (499999, 0.0)]
([8, 0, 3], [8, 0, 3])
tensor(7.7683, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -36.72368240356445), (499999, 0.0), (499999, 0.0)]
([8, 0, 3], [8, 0, 3])
tensor(7.8134, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -57.3905029296875), (499999, 0.0), (499999, 0.0)]
([8, 0, 3], [8, 0, 3])
tensor(7.8104, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -78.07681274414062), (499999, 0.0), (499999, 0.0)]
([8, 0, 3], [8, 0, 3])
tensor(7.8104, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -98.76311492919922), (499999, 0.0), (499999, 0.0)]
([8, 0, 3], [8, 0, 3])
tensor(7.8104, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -119.44942474365234), (499999, 0.0), (499999, 0.0)]
([8, 0, 3], [8, 0, 3])
tensor(7.8103, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -140.13507080078125), (499999, 0.0), (4999

 34%|███▍      | 245/725 [8:28:46<18:11:38, 136.45s/it]

test：1.0, test mean: 0.3089887640449438
([2, 2, 1], [2, 2, 1])
tensor(21.4279, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 34%|███▍      | 246/725 [8:29:12<13:43:29, 103.15s/it]

([1, 2, 0], [1, 2, 1])
tensor(16.1481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.000282337365206331)]
([1, 2, 0], [1, 2, 1])
tensor(16.1481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.0005867457948625088)]
([1, 2, 0], [1, 2, 1])
tensor(16.1481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.0008907649898901582)]
([1, 2, 0], [1, 2, 1])
tensor(16.1481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.0011947843013331294)]
([1, 2, 0], [1, 2, 1])
tensor(16.1481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.0014988034963607788)]
([1, 2, 0], [1, 2, 1])
tensor(16.1481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -0.0018028229242190719)]
([1, 2, 0], [1, 2, 1])
tensor(16.1481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999

 34%|███▍      | 247/725 [8:34:08<21:22:43, 161.01s/it]

test：0.0, test mean: 0.3055555555555556
([1, 0, 0], [1, 0, 1])
tensor(8.2015, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.006782697048038244)]


 34%|███▍      | 248/725 [8:34:37<16:05:42, 121.47s/it]

([3, 1, 1], [5, 1, 1])
tensor(22.2560, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 34%|███▍      | 249/725 [8:35:08<12:28:56, 94.40s/it] 

([1, 2, 1], [1, 2, 1])
tensor(27.0685, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 34%|███▍      | 250/725 [8:35:37<9:51:49, 74.76s/it] 

([0, 2, 7], [0, 2, 7])
tensor(11.8426, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 4.535180408993256e-08), (499999, 0.0), (499999, 0.0)]


 35%|███▍      | 251/725 [8:36:10<8:10:49, 62.13s/it]

([7, 1, 7], [7, 1, 7])
tensor(20.7287, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.9181559085845947), (499999, 0.0), (380927, -0.3320995271205902)]
([7, 1, 7], [7, 1, 7])
tensor(16.5179, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 49.765953063964844), (499999, 0.0), (380927, 20.191207885742188)]
([7, 1, 7], [7, 1, 7])
tensor(10.1854, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 52.46993637084961), (499999, 0.0), (380927, 3.1496474742889404)]
([0, 1, 0], [7, 1, 7])
tensor(7.3960, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 52.47087097167969), (499999, 0.0), (380927, 3.1637957096099854)]
([0, 1, 0], [7, 1, 7])
tensor(7.3960, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 52.4718017578125), (499999, 0.0), (380927, 3.175877571105957)]
([0, 1, 0], [7, 1, 7])
tensor(7.3960, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 52.47273254394531), (499999, 0.0), (380927, 3.1857776641845703)]
([0, 1, 0], [7, 1, 7])
tensor(7.3960, device='cu

 35%|███▍      | 252/725 [8:42:15<20:07:42, 153.20s/it]

test：0.0, test mean: 0.3021978021978022
([8, 1, 0], [8, 1, 0])
tensor(3.6000, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 35%|███▍      | 253/725 [8:42:46<15:15:15, 116.35s/it]

([2, 0, 0], [2, 0, 0])
tensor(11.6906, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -1.8218173863715492e-05), (442367, 4.234968855598709e-06)]
([2, 0, 0], [2, 0, 0])
tensor(11.6906, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -3.6436347727430984e-05), (442367, 7.63329808251001e-06)]
([2, 0, 0], [2, 0, 0])
tensor(11.6906, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -5.465452341013588e-05), (442367, 8.224173143389635e-06)]
([2, 0, 0], [2, 0, 0])
tensor(11.6906, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -7.287269545486197e-05), (442367, 8.224111297749914e-06)]
([2, 0, 0], [2, 0, 0])
tensor(11.6906, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -9.109088568948209e-05), (442367, 8.218531547754537e-06)]
([2, 0, 0], [2, 0, 0])
tensor(11.6906, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.00010930908320005983), (442367, 8.218480616051238e-0

 35%|███▌      | 254/725 [8:48:51<24:58:59, 190.95s/it]

test：1.0, test mean: 0.30978260869565216
([5, 8, 2], [5, 8, 2])
tensor(18.8599, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 35%|███▌      | 255/725 [8:49:21<18:38:23, 142.77s/it]

([0, 8, 0], [0, 8, 0])
tensor(3.9241, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 35%|███▌      | 256/725 [8:49:51<14:12:30, 109.06s/it]

([2, 8, 1], [2, 8, 1])
tensor(21.1395, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -1.7483831470599398e-05), (499999, 0.0)]


 35%|███▌      | 257/725 [8:50:24<11:10:38, 85.98s/it] 

([2, 3, 2], [2, 3, 2])
tensor(21.8023, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 36%|███▌      | 258/725 [8:51:03<9:19:19, 71.86s/it] 

([2, 2, 2], [2, 2, 2])
tensor(30.0463, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.009891888126730919)]


 36%|███▌      | 259/725 [8:51:33<7:42:43, 59.58s/it]

([8, 2, 2], [5, 2, 2])
tensor(16.0281, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 36%|███▌      | 260/725 [8:52:06<6:38:54, 51.47s/it]

([2, 2, 3], [2, 2, 3])
tensor(23.1895, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 36%|███▌      | 261/725 [8:52:38<5:52:38, 45.60s/it]

([8, 0, 1], [8, 0, 1])
tensor(8.5429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.0003213096351828426), (499999, 0.0)]
([8, 0, 1], [8, 0, 1])
tensor(8.5429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.0033495977986603975), (499999, 0.0)]
([8, 0, 1], [8, 0, 1])
tensor(8.5428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.003349504666402936), (499999, 0.0)]
([8, 0, 1], [8, 0, 1])
tensor(8.5428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.0033494119998067617), (499999, 0.0)]
([8, 0, 1], [8, 0, 1])
tensor(8.5428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.0033493186347186565), (499999, 0.0)]
([8, 0, 1], [8, 0, 1])
tensor(8.5428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.003349226200953126), (499999, 0.0)]
([8, 0, 1], [8, 0, 1])
tensor(8.5428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.0033491335

 36%|███▌      | 262/725 [8:57:43<15:53:06, 123.51s/it]

test：1.0, test mean: 0.3172043010752688
([2, 2, 2], [2, 2, 2])
tensor(22.2779, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 36%|███▋      | 263/725 [8:58:11<12:10:41, 94.90s/it] 

([0, 1, 1], [0, 1, 1])
tensor(13.6275, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.010213741101324558)]


 36%|███▋      | 264/725 [8:58:41<9:37:45, 75.20s/it] 

([7, 0, 2], [7, 0, 2])
tensor(12.8863, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.7659730911254883), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 0, 2])
tensor(9.9853, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 22.598499298095703), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 0, 2])
tensor(6.9450, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 15.498944282531738), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(5.9058, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 15.498978614807129), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(5.9058, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 15.499016761779785), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(5.9058, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 15.499049186706543), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(5.9058, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 15.499083518981934), (499999, 0.0), (49999

 37%|███▋      | 265/725 [9:03:48<18:30:01, 144.79s/it]

test：0.0, test mean: 0.31382978723404253
([0, 2, 3], [0, 2, 4])
tensor(14.4339, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 37%|███▋      | 266/725 [9:04:15<13:57:20, 109.46s/it]

([1, 6, 1], [1, 6, 1])
tensor(19.7357, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 37%|███▋      | 267/725 [9:04:42<10:48:16, 84.93s/it] 

([0, 0, 8], [7, 0, 8])
tensor(1.9390, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (405503, -0.00017270246462430805), (499999, 0.0)]
([0, 0, 8], [7, 0, 8])
tensor(1.9390, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (405503, -0.00025927548995241523), (499999, 0.0)]
([0, 0, 8], [7, 0, 8])
tensor(1.9392, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (405503, -0.01915084943175316), (499999, 0.0)]
([0, 0, 8], [7, 0, 8])
tensor(1.9392, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (405503, -0.02600257471203804), (499999, 0.0)]
([0, 0, 8], [7, 0, 8])
tensor(1.9390, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (405503, -0.025992488488554955), (499999, 0.0)]
([0, 0, 8], [7, 0, 8])
tensor(1.9390, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (405503, -0.026073619723320007), (499999, 0.0)]
([0, 0, 8], [7, 0, 8])
tensor(1.9390, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (405503, -0.02614

 37%|███▋      | 268/725 [9:09:50<19:16:12, 151.80s/it]

test：1.0, test mean: 0.32105263157894737
([0, 8, 1], [0, 8, 1])
tensor(11.6551, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.10060301423072815), (499999, 0.0), (499999, 0.0)]
([0, 8, 1], [0, 8, 1])
tensor(11.6625, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.022122830152511597), (499999, 0.0), (499999, 0.0)]
([0, 8, 1], [0, 8, 1])
tensor(11.6466, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.021905064582824707), (499999, 0.0), (499999, 0.0)]
([0, 8, 1], [0, 8, 1])
tensor(11.6466, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -0.021720774471759796), (499999, 0.0), (499999, 0.0)]
([0, 8, 1], [0, 8, 1])
tensor(11.6476, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 0.19230782985687256), (499999, 0.0), (499999, 0.0)]
([0, 8, 1], [0, 8, 1])
tensor(11.6468, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 0.1993064284324646), (499999, 0.0), (499999, 0.0)]
([0, 8, 1], [0, 8, 1])
tensor(11.6466, device='cuda:0', grad_fn=<NllLossBackw

 37%|███▋      | 269/725 [9:14:57<25:07:49, 198.40s/it]

test：1.0, test mean: 0.328125
([0, 3, 5], [0, 3, 5])
tensor(4.2762, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 37%|███▋      | 270/725 [9:15:27<18:40:11, 147.72s/it]

([5, 2, 1], [5, 2, 1])
tensor(18.7838, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -3.17907452583313), (499999, 0.0), (499999, 0.0)]
([5, 2, 1], [5, 2, 1])
tensor(18.0834, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -28.613449096679688), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(16.4466, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -28.590011596679688), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(16.4465, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -28.55861473083496), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(16.4465, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -28.527677536010742), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(16.4465, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -28.499048233032227), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(16.4465, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -28.47307014465332), (499999, 0.0

 37%|███▋      | 271/725 [9:20:29<24:28:56, 194.13s/it]

test：0.0, test mean: 0.3247422680412371
([2, 7, 2], [2, 7, 2])
tensor(16.3458, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 38%|███▊      | 272/725 [9:20:58<18:10:36, 144.45s/it]

([0, 3, 1], [0, 3, 1])
tensor(11.3310, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -5.699517302559798e-08), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(11.3310, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -1.1399035315662331e-07), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(11.3310, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -1.7098552973493497e-07), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(11.3310, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -2.2798070631324663e-07), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(11.3310, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -2.84975897102413e-07), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(11.3310, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -3.4197105946986994e-07), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(11.3310, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -3.989662

 38%|███▊      | 273/725 [9:26:03<24:11:56, 192.73s/it]

test：1.0, test mean: 0.33163265306122447
([7, 7, 0], [7, 7, 0])
tensor(14.3110, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.9423108100891113), (380927, 2.9181621074676514), (499999, 0.0)]
([7, 7, 0], [7, 7, 0])
tensor(9.5731, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 14.513225555419922), (380927, 21.811643600463867), (499999, 0.0)]
([7, 7, 0], [7, 7, 0])
tensor(6.4152, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -2.562006950378418), (380927, 20.194076538085938), (499999, 0.0)]
([0, 0, 0], [7, 7, 0])
tensor(0.0349, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -2.574009656906128), (380927, 23.12480354309082), (499999, 0.0)]
([0, 0, 0], [7, 7, 0])
tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -2.5860159397125244), (380927, 23.122549057006836), (499999, 0.0)]
([0, 0, 0], [7, 7, 0])
tensor(0.0001, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -2.5980234146118164), (380927, 23.12029266357422), (499999, 0.0)]
([0,

 38%|███▊      | 274/725 [9:31:03<28:09:40, 224.79s/it]

test：0.0, test mean: 0.3282828282828283
([2, 3, 0], [2, 3, 0])
tensor(11.3036, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, -14.291259765625), (499999, 0.0)]
([2, 3, 0], [2, 3, 0])
tensor(11.0980, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, -43.905052185058594), (499999, 0.0)]
([2, 3, 0], [2, 3, 0])
tensor(10.6877, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, -83.26394653320312), (499999, 0.0)]
([2, 3, 0], [2, 3, 0])
tensor(9.8880, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, -112.62107849121094), (499999, 0.0)]
([2, 3, 0], [2, 3, 0])
tensor(9.3586, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, -141.59112548828125), (499999, 0.0)]
([2, 3, 0], [2, 3, 0])
tensor(8.9373, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (245759, -169.450927734375), (499999, 0.0)]
([2, 3, 0], [2, 3, 0])
tensor(8.6769, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 

 38%|███▊      | 275/725 [9:36:07<31:03:53, 248.52s/it]

test：1.0, test mean: 0.335
([1, 1, 1], [1, 1, 1])
tensor(16.3286, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 38%|███▊      | 276/725 [9:36:31<22:35:56, 181.19s/it]

([3, 0, 8], [3, 0, 8])
tensor(14.3595, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 38%|███▊      | 277/725 [9:36:55<16:41:10, 134.09s/it]

([8, 2, 2], [8, 2, 2])
tensor(26.1968, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 38%|███▊      | 278/725 [9:37:21<12:38:15, 101.78s/it]

([6, 5, 1], [6, 5, 1])
tensor(14.6577, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.000500866852235049), (499999, 0.0)]


 38%|███▊      | 279/725 [9:37:46<9:44:24, 78.62s/it]  

([1, 2, 8], [1, 2, 8])
tensor(19.1091, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 39%|███▊      | 280/725 [9:38:12<7:46:53, 62.95s/it]

([3, 2, 6], [3, 2, 6])
tensor(18.1293, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 39%|███▉      | 281/725 [9:38:38<6:22:17, 51.66s/it]

([0, 3, 2], [0, 3, 2])
tensor(13.9982, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 39%|███▉      | 282/725 [9:39:05<5:28:27, 44.49s/it]

([8, 1, 7], [8, 1, 7])
tensor(17.9910, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0006427774205803871), (499999, 0.0), (380927, 2.9181621074676514)]
([8, 1, 7], [8, 1, 7])
tensor(16.0067, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0012855548411607742), (499999, 0.0), (380927, 21.462276458740234)]
([8, 1, 7], [8, 1, 7])
tensor(14.1741, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0019283294677734375), (499999, 0.0), (380927, 20.830448150634766)]
([8, 1, 0], [8, 1, 7])
tensor(11.8064, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0025711096823215485), (499999, 0.0), (380927, 20.93474578857422)]
([8, 1, 0], [8, 1, 7])
tensor(11.8063, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0032138824462890625), (499999, 0.0), (380927, 21.020540237426758)]
([8, 1, 0], [8, 1, 7])
tensor(11.8063, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.003856644034385681), (499999, 0.0), (380927, 21.0972843170166)]
([8, 1, 0], [8, 1, 7

 39%|███▉      | 283/725 [9:43:52<14:22:42, 117.11s/it]

test：0.0, test mean: 0.3316831683168317
([2, 2, 7], [2, 2, 7])
tensor(30.5009, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.7659730911254883)]
([2, 2, 7], [2, 2, 7])
tensor(27.4953, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 17.73682975769043)]
([2, 2, 7], [2, 2, 7])
tensor(25.2195, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 115.19287109375)]
([2, 2, 0], [2, 2, 7])
tensor(22.1577, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 115.38324737548828)]
([2, 2, 0], [2, 2, 7])
tensor(22.1551, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 115.52745819091797)]
([2, 2, 0], [2, 2, 7])
tensor(22.1533, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 115.56643676757812)]
([2, 2, 0], [2, 2, 7])
tensor(22.1521, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 

 39%|███▉      | 284/725 [9:48:38<20:34:11, 167.92s/it]

test：0.0, test mean: 0.3284313725490196
([7, 5, 1], [7, 5, 1])
tensor(18.7636, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9423108100891113), (499999, 0.0), (499999, 0.0)]
([7, 5, 1], [7, 5, 1])
tensor(16.5074, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 30.657974243164062), (499999, 0.0), (499999, 0.0)]
([7, 5, 1], [7, 5, 1])
tensor(14.4955, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 9.170475959777832), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 5, 1])
tensor(11.4304, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 9.001665115356445), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 5, 1])
tensor(11.4303, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 8.84292984008789), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 5, 1])
tensor(11.4303, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 8.67385196685791), (499999, 0.0), (499999, 0.0)]
([0, 5, 1], [7, 5, 1])
tensor(11.4303, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 8.

 39%|███▉      | 285/725 [9:52:59<23:54:45, 195.65s/it]

test：0.0, test mean: 0.32524271844660196
([8, 1, 1], [8, 1, 1])
tensor(15.2057, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.003973178565502167), (499999, 0.0), (499999, 0.0)]


 39%|███▉      | 286/725 [9:53:24<17:38:28, 144.67s/it]

([1, 1, 2], [1, 1, 2])
tensor(18.1622, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0005837425123900175), (499999, 0.0)]


 40%|███▉      | 287/725 [9:53:49<13:12:24, 108.55s/it]

([5, 7, 2], [5, 7, 2])
tensor(18.0409, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 2.9181525707244873), (499999, 0.0)]
([5, 7, 2], [5, 7, 2])
tensor(14.2207, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 54.27873992919922), (499999, 0.0)]
([5, 7, 2], [5, 7, 2])
tensor(12.7728, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 55.55647659301758), (499999, 0.0)]
([5, 0, 2], [5, 7, 2])
tensor(11.5911, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 55.57359313964844), (499999, 0.0)]
([5, 0, 2], [5, 7, 2])
tensor(11.5911, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 55.59071350097656), (499999, 0.0)]
([5, 0, 2], [5, 7, 2])
tensor(11.5911, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 55.60601806640625), (499999, 0.0)]
([5, 0, 2], [5, 7, 2])
tensor(11.5911, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 55.62131881713867), (49999

 40%|███▉      | 288/725 [9:58:13<18:50:27, 155.21s/it]

test：0.0, test mean: 0.32211538461538464
([3, 1, 7], [3, 1, 7])
tensor(16.3966, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.712027072906494)]
([3, 1, 7], [3, 1, 7])
tensor(13.5372, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 19.247451782226562)]
([3, 1, 7], [3, 1, 7])
tensor(10.2131, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 21.875534057617188)]
([3, 1, 0], [3, 1, 7])
tensor(9.6772, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 21.875499725341797)]
([3, 1, 0], [3, 1, 7])
tensor(9.6772, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 21.87546730041504)]
([3, 1, 0], [3, 1, 7])
tensor(9.6772, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 21.875436782836914)]
([3, 1, 0], [3, 1, 7])
tensor(9.6772, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0

 40%|███▉      | 289/725 [10:02:56<23:26:09, 193.51s/it]

test：0.0, test mean: 0.319047619047619
([3, 2, 0], [3, 2, 0])
tensor(15.0302, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 40%|████      | 290/725 [10:03:20<17:15:06, 142.77s/it]

([3, 6, 0], [3, 6, 0])
tensor(9.0142, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -17.19182777404785), (499999, 0.0), (499999, 0.0)]
([3, 6, 0], [3, 6, 0])
tensor(8.5905, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -30.550277709960938), (499999, 0.0), (499999, 0.0)]
([3, 6, 0], [3, 6, 0])
tensor(6.7051, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -29.5848388671875), (499999, 0.0), (499999, 0.0)]
([3, 6, 0], [3, 6, 0])
tensor(4.7021, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -60.482810974121094), (499999, 0.0), (499999, 0.0)]
([3, 6, 0], [3, 6, 0])
tensor(6.4257, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -93.9530029296875), (499999, 0.0), (499999, 0.0)]
([3, 6, 0], [3, 6, 0])
tensor(5.1737, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -84.58055114746094), (499999, 0.0), (499999, 0.0)]
([0, 6, 0], [3, 6, 0])
tensor(3.8877, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, -81.43624877929688), (499999, 0.0), (499999

 40%|████      | 291/725 [10:07:46<21:40:35, 179.80s/it]

test：0.0, test mean: 0.3160377358490566
([2, 8, 2], [2, 8, 2])
tensor(24.2135, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 40%|████      | 292/725 [10:08:13<16:06:26, 133.92s/it]

([1, 2, 2], [1, 2, 2])
tensor(24.2395, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0005937283858656883), (499999, 0.0), (499999, 0.0)]


 40%|████      | 293/725 [10:08:40<12:12:05, 101.68s/it]

([1, 7, 1], [1, 7, 1])
tensor(23.3032, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([1, 7, 1], [1, 7, 1])
tensor(20.8676, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 26.587846755981445), (499999, 0.0)]
([1, 7, 1], [1, 7, 1])
tensor(19.9255, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 32.0900993347168), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(16.4250, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 35.36499786376953), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(16.4099, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 35.36842727661133), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(16.4099, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 35.371849060058594), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(16.4099, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 35.375274658203125), (499

 41%|████      | 294/725 [10:13:12<18:18:06, 152.87s/it]

test：0.0, test mean: 0.3130841121495327
([2, 1, 2], [2, 1, 2])
tensor(22.9869, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 41%|████      | 295/725 [10:13:37<13:41:49, 114.67s/it]

([2, 0, 2], [2, 0, 2])
tensor(16.0766, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -1.0322049881494877e-08), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(16.0766, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -2.0644099762989754e-08), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(16.0766, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -3.096614875630621e-08), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(16.0766, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -4.128819952597951e-08), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(16.0766, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -5.1610250295652804e-08), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(16.0766, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -6.193229751261242e-08), (499999, 0.0)]
([2, 0, 2], [2, 0, 2])
tensor(16.0766, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (45

 41%|████      | 296/725 [10:18:06<19:10:12, 160.87s/it]

test：1.0, test mean: 0.3194444444444444
([5, 2, 1], [5, 2, 1])
tensor(18.7129, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.0007144180126488209)]


 41%|████      | 297/725 [10:18:33<14:19:47, 120.53s/it]

([2, 1, 1], [2, 1, 1])
tensor(24.1004, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 41%|████      | 298/725 [10:18:58<10:55:45, 92.14s/it] 

([2, 0, 2], [2, 7, 2])
tensor(9.6537, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 41%|████      | 299/725 [10:19:24<8:32:56, 72.24s/it] 

([2, 3, 1], [2, 3, 1])
tensor(23.8660, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 41%|████▏     | 300/725 [10:19:50<6:52:02, 58.17s/it]

([1, 5, 1], [1, 5, 1])
tensor(11.7446, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 42%|████▏     | 301/725 [10:20:15<5:40:56, 48.25s/it]

([0, 7, 7], [0, 7, 7])
tensor(6.5215, device='cuda:0', grad_fn=<NllLossBackward0>)
[(417791, 1.3852265112745954e-07), (380927, 2.9181528091430664), (499999, 0.0)]
([0, 7, 7], [0, 7, 7])
tensor(3.3025, device='cuda:0', grad_fn=<NllLossBackward0>)
[(417791, -3.806930948258014e-08), (380927, 53.94455337524414), (499999, 0.0)]
([0, 7, 7], [0, 7, 7])
tensor(2.5779, device='cuda:0', grad_fn=<NllLossBackward0>)
[(417791, -0.0006767534650862217), (380927, 71.45274353027344), (499999, 0.0)]
([0, 0, 7], [0, 7, 7])
tensor(1.9836, device='cuda:0', grad_fn=<NllLossBackward0>)
[(417791, -0.0006759103271178901), (380927, 71.47409057617188), (499999, 0.0)]
([0, 0, 7], [0, 7, 7])
tensor(1.9836, device='cuda:0', grad_fn=<NllLossBackward0>)
[(417791, -0.0005422834074124694), (380927, 71.49542236328125), (499999, 0.0)]
([0, 0, 7], [0, 7, 7])
tensor(1.9836, device='cuda:0', grad_fn=<NllLossBackward0>)
[(417791, -0.0005422713002189994), (380927, 71.51676940917969), (499999, 0.0)]
([0, 0, 7], [0, 7, 7])
tens

 42%|████▏     | 302/725 [10:24:35<13:08:12, 111.80s/it]

test：0.5, test mean: 0.3211009174311927
([0, 6, 1], [0, 6, 1])
tensor(6.9218, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 42%|████▏     | 303/725 [10:25:00<10:02:47, 85.70s/it] 

([2, 1, 0], [2, 1, 1])
tensor(19.7780, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -3.231491064070724e-06)]


 42%|████▏     | 304/725 [10:25:25<7:55:04, 67.71s/it] 

([7, 2, 2], [7, 2, 2])
tensor(26.6920, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(22.0344, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.336997985839844), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(22.1252, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 11.329551696777344), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(17.8529, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 20.52182960510254), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(17.7708, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 20.521257400512695), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(17.7708, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 20.52068519592285), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(17.7708, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 20.520112991333008), (499999, 0.0), (4

 42%|████▏     | 305/725 [10:29:48<14:43:00, 126.14s/it]

test：0.0, test mean: 0.3181818181818182
([0, 1, 3], [0, 1, 3])
tensor(12.1217, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 42%|████▏     | 306/725 [10:30:12<11:08:11, 95.68s/it] 

([2, 3, 2], [2, 3, 2])
tensor(16.4560, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 42%|████▏     | 307/725 [10:30:39<8:41:43, 74.89s/it] 

([1, 0, 2], [1, 0, 2])
tensor(15.6444, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 1.1474887287477031e-05), (499999, 0.0)]
([1, 0, 2], [1, 0, 2])
tensor(15.6444, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.00012940783926751465), (499999, 0.0)]
([1, 0, 2], [1, 0, 2])
tensor(15.6444, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.00012935022823512554), (499999, 0.0)]
([1, 0, 2], [1, 0, 2])
tensor(15.6444, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.00012929394142702222), (499999, 0.0)]
([1, 0, 2], [1, 0, 2])
tensor(15.6444, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.00012923787289764732), (499999, 0.0)]
([1, 0, 2], [1, 0, 2])
tensor(15.6444, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.00012919396976940334), (499999, 0.0)]
([1, 0, 2], [1, 0, 2])
tensor(15.6444, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (43007

 42%|████▏     | 308/725 [10:35:06<15:21:34, 132.60s/it]

test：1.0, test mean: 0.32432432432432434
([1, 7, 1], [1, 7, 1])
tensor(21.4254, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([1, 7, 1], [1, 7, 1])
tensor(19.4412, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 21.694992065429688), (499999, 0.0)]
([1, 7, 1], [1, 7, 1])
tensor(17.6809, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 13.650177001953125), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(15.0972, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 13.57441520690918), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(15.0970, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 13.515434265136719), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(15.0969, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 13.470806121826172), (499999, 0.0)]
([1, 0, 1], [1, 7, 1])
tensor(15.0969, device='cuda:0', grad_fn=<NllLossBackward0>)
[(4999

 43%|████▎     | 309/725 [10:39:37<20:08:04, 174.24s/it]

test：0.0, test mean: 0.32142857142857145
([0, 3, 1], [0, 3, 1])
tensor(11.4100, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -2.108750152274297e-08), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(11.4100, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -1.0069646805277443e-07), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(11.4100, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -1.761569734526347e-07), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(11.4100, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -2.5045062557182973e-07), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(11.4100, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -3.247442350584606e-07), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(11.4100, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -3.9903790138851036e-07), (499999, 0.0), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(11.4100, device='cuda:0', grad_f

 43%|████▎     | 310/725 [10:44:04<23:15:57, 201.83s/it]

test：1.0, test mean: 0.3274336283185841
([1, 1, 8], [1, 1, 8])
tensor(15.8060, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 43%|████▎     | 311/725 [10:44:28<17:06:09, 148.72s/it]

([2, 1, 2], [2, 1, 2])
tensor(27.2609, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -7.4367570877075195), (499999, 0.0)]
([2, 1, 2], [2, 1, 2])
tensor(27.3562, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -13.653286933898926), (499999, 0.0)]
([2, 1, 2], [2, 1, 2])
tensor(27.3562, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -19.869815826416016), (499999, 0.0)]
([2, 1, 2], [2, 1, 2])
tensor(27.3562, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -26.086347579956055), (499999, 0.0)]
([2, 1, 2], [2, 1, 2])
tensor(27.3562, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -32.30287551879883), (499999, 0.0)]
([2, 1, 2], [2, 1, 2])
tensor(27.3562, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -38.519405364990234), (499999, 0.0)]
([2, 1, 2], [2, 1, 2])
tensor(27.3562, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, -44.735939025878

 43%|████▎     | 312/725 [10:48:58<21:14:02, 185.09s/it]

test：1.0, test mean: 0.3333333333333333
([7, 7, 7], [7, 7, 7])
tensor(21.5798, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (380927, 2.9181621074676514), (380927, 2.7659730911254883)]
([7, 7, 7], [7, 7, 7])
tensor(13.5345, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 20.855499267578125), (380927, 9.089452743530273), (380927, 33.681663513183594)]
([7, 7, 7], [7, 7, 7])
tensor(7.7036, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.120274543762207), (380927, 5.499117851257324), (380927, 154.61181640625)]
([0, 0, 0], [7, 7, 7])
tensor(0.0170, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.119187355041504), (380927, 5.495545387268066), (380927, 155.57431030273438)]
([0, 0, 0], [7, 7, 7])
tensor(0.0097, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.118098258972168), (380927, 5.491968154907227), (380927, 156.11764526367188)]
([0, 0, 0], [7, 7, 7])
tensor(0.0051, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.117

 43%|████▎     | 313/725 [10:53:47<24:44:15, 216.15s/it]

test：0.0, test mean: 0.33043478260869563
([0, 7, 1], [0, 7, 1])
tensor(6.2529, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 43%|████▎     | 314/725 [10:54:11<18:05:37, 158.48s/it]

([7, 6, 6], [7, 6, 6])
tensor(14.1721, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.918154716491699), (499999, 0.0), (499999, 0.0)]
([7, 6, 6], [7, 6, 6])
tensor(10.4233, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 54.28713607788086), (499999, 0.0), (499999, 0.0)]
([7, 6, 6], [7, 6, 6])
tensor(8.9293, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 54.422080993652344), (499999, 0.0), (499999, 0.0)]
([0, 6, 6], [7, 6, 6])
tensor(7.6085, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 55.057003021240234), (499999, 0.0), (499999, 0.0)]
([0, 6, 6], [7, 6, 6])
tensor(7.6066, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 55.29606628417969), (499999, 0.0), (499999, 0.0)]
([0, 6, 6], [7, 6, 6])
tensor(7.6063, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 55.46034240722656), (499999, 0.0), (499999, 0.0)]
([0, 6, 6], [7, 6, 6])
tensor(7.6061, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 55.591400146484375), (499999, 0.0), (499999, 

 43%|████▎     | 315/725 [10:58:40<21:49:47, 191.68s/it]

test：0.0, test mean: 0.3275862068965517
([2, 2, 1], [2, 2, 1])
tensor(31.1827, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.010405882261693478), (499999, 0.0)]


 44%|████▎     | 316/725 [10:59:07<16:09:49, 142.27s/it]

([6, 2, 0], [6, 2, 0])
tensor(11.1974, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 44%|████▎     | 317/725 [10:59:30<12:03:44, 106.43s/it]

([2, 2, 6], [2, 2, 6])
tensor(18.3512, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 44%|████▍     | 318/725 [10:59:54<9:13:46, 81.64s/it]  

([7, 2, 2], [7, 2, 2])
tensor(27.6866, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(24.4057, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 17.661983489990234), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(23.7803, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 21.921613693237305), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(18.8881, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 28.71799659729004), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(18.7903, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 28.706832885742188), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(18.7903, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 28.697214126586914), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(18.7903, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 28.691925048828125), (499999, 0.0), (

 44%|████▍     | 319/725 [11:04:23<15:32:46, 137.85s/it]

test：0.0, test mean: 0.3247863247863248
([5, 1, 8], [5, 1, 5])
tensor(14.8604, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 44%|████▍     | 320/725 [11:04:47<11:40:46, 103.82s/it]

([8, 1, 5], [8, 1, 5])
tensor(15.8879, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.005630332976579666), (499999, 0.0)]


 44%|████▍     | 321/725 [11:05:12<8:59:50, 80.17s/it]  

([1, 7, 2], [1, 7, 2])
tensor(20.7857, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([1, 7, 2], [1, 7, 2])
tensor(18.3131, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 21.846389770507812), (499999, 0.0)]
([1, 7, 2], [1, 7, 2])
tensor(17.9394, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 20.228849411010742), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(13.7889, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 23.616943359375), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(13.7486, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 23.61443519592285), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(13.7486, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 23.611923217773438), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(13.7486, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 23.60941505432129), (4999

 44%|████▍     | 322/725 [11:10:08<16:13:00, 144.87s/it]

test：0.0, test mean: 0.3220338983050847
([7, 2, 0], [7, 2, 0])
tensor(19.0179, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(15.7015, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 8.932491302490234), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(14.3778, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.42215871810913086), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(10.1607, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.42178964614868164), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(10.1607, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.4220442771911621), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(10.1607, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.42229795455932617), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(10.1607, device='cuda:0', grad_fn=<NllLossBackward0>)
[(38

 45%|████▍     | 323/725 [11:14:47<20:40:26, 185.14s/it]

test：0.0, test mean: 0.31932773109243695
([2, 0, 0], [2, 0, 0])
tensor(14.3184, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 45%|████▍     | 324/725 [11:15:11<15:14:02, 136.77s/it]

([0, 3, 6], [0, 3, 6])
tensor(7.5225, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.068208414129913e-05), (499999, 0.0), (499999, 0.0)]
([0, 3, 6], [0, 3, 6])
tensor(7.5225, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 3.265531995566562e-05), (499999, 0.0), (499999, 0.0)]
([0, 3, 6], [0, 3, 6])
tensor(7.5225, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.0014085291186347604), (499999, 0.0), (499999, 0.0)]
([0, 3, 6], [0, 3, 6])
tensor(7.5225, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.0014200033619999886), (499999, 0.0), (499999, 0.0)]
([0, 3, 6], [0, 3, 6])
tensor(7.5225, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.001428298419341445), (499999, 0.0), (499999, 0.0)]
([0, 3, 6], [0, 3, 6])
tensor(7.5225, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -0.001432549674063921), (499999, 0.0), (499999, 0.0)]
([0, 3, 6], [0, 3, 6])
tensor(7.5394, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -2.114248752593994), (4

 45%|████▍     | 325/725 [11:20:04<20:23:35, 183.54s/it]

test：1.0, test mean: 0.325
([7, 7, 2], [7, 7, 2])
tensor(15.6469, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 45%|████▍     | 326/725 [11:20:31<15:08:32, 136.62s/it]

([0, 1, 2], [0, 1, 2])
tensor(15.5046, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 1.178352748709699e-09), (499999, 0.0), (499999, 0.0)]


 45%|████▌     | 327/725 [11:20:57<11:27:07, 103.59s/it]

([0, 1, 3], [0, 1, 3])
tensor(8.9745, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.016709527000784874), (499999, 0.0)]


 45%|████▌     | 328/725 [11:21:25<8:55:40, 80.96s/it]  

([7, 0, 2], [7, 0, 2])
tensor(13.7768, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 0, 2])
tensor(11.8169, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 21.462310791015625), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 0, 2])
tensor(9.9972, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 20.830215454101562), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(7.4568, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 21.124526977539062), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(7.4563, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 21.340763092041016), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(7.4557, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 21.44356918334961), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 0, 2])
tensor(7.4557, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 21.545059204101562), (499999, 0.0), (49999

 45%|████▌     | 329/725 [11:26:15<15:48:05, 143.65s/it]

test：0.0, test mean: 0.32231404958677684
([7, 0, 0], [7, 0, 0])
tensor(1.0243, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -1.9638331139049114e-07), (499999, 0.0)]
([7, 0, 0], [7, 0, 0])
tensor(1.0243, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -4.658625130105065e-07), (499999, 0.0)]
([7, 0, 0], [7, 0, 0])
tensor(1.0243, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -9.99466919893166e-07), (499999, 0.0)]
([7, 0, 0], [7, 0, 0])
tensor(1.0243, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -3.626500756581663e-06), (499999, 0.0)]
([7, 0, 0], [7, 0, 0])
tensor(1.0243, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 6.880168712086743e-06), (499999, 0.0)]
([7, 0, 0], [7, 0, 0])
tensor(1.0243, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 7.0061246333352756e-06), (499999, 0.0)]
([7, 0, 0], [7, 0, 0])
tensor(1.0243, device='cuda:0', grad_fn=<NllLossB

 46%|████▌     | 330/725 [11:30:59<20:22:15, 185.66s/it]

test：1.0, test mean: 0.32786885245901637
([7, 7, 2], [7, 7, 2])
tensor(22.7256, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.7659730911254883), (380927, 2.7659730911254883), (499999, 0.0)]
([7, 7, 2], [7, 7, 2])
tensor(18.2423, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 22.4570369720459), (380927, 25.111251831054688), (499999, 0.0)]
([7, 7, 2], [7, 7, 2])
tensor(12.3968, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 133.491455078125), (380927, 124.8648910522461), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(8.4350, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 133.4173126220703), (380927, 125.20709991455078), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(8.4211, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 133.38912963867188), (380927, 125.21638488769531), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(8.4164, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 133.36734008789062), (380927, 125.20443725585938), (499999, 0.0)]
([0, 0, 

 46%|████▌     | 331/725 [11:35:46<23:38:02, 215.95s/it]

test：0.0, test mean: 0.3252032520325203
([8, 2, 0], [8, 2, 0])
tensor(15.9328, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -1.6596683281022706e-06)]


 46%|████▌     | 332/725 [11:36:13<17:23:26, 159.30s/it]

([8, 6, 1], [8, 6, 1])
tensor(14.5055, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 46%|████▌     | 333/725 [11:36:41<13:03:03, 119.86s/it]

([8, 8, 2], [8, 8, 2])
tensor(23.2325, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0034195035696029663), (499999, 0.0), (499999, 0.0)]


 46%|████▌     | 334/725 [11:37:09<10:01:45, 92.34s/it] 

([2, 3, 1], [2, 7, 1])
tensor(22.0991, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.052751339972019196)]


 46%|████▌     | 335/725 [11:37:38<7:58:04, 73.55s/it] 

([7, 3, 0], [7, 3, 5])
tensor(9.9089, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.9181618690490723), (499999, 0.0), (233471, 0.00025848724180832505)]
([7, 3, 0], [7, 3, 5])
tensor(5.7233, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 46.232643127441406), (499999, 0.0), (233471, 0.004992397036403418)]
([7, 3, 0], [7, 3, 5])
tensor(5.7924, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 35.09180450439453), (499999, 0.0), (233471, 0.0016008296515792608)]
([0, 3, 0], [7, 3, 5])
tensor(3.7912, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 57.71773910522461), (499999, 0.0), (233471, 0.0016010781982913613)]
([0, 3, 0], [7, 3, 5])
tensor(3.7241, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 57.71771240234375), (499999, 0.0), (233471, 0.0015975970309227705)]
([0, 3, 0], [7, 3, 5])
tensor(3.7241, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 57.717681884765625), (499999, 0.0), (233471, 0.0015346857253462076)]
([0, 3, 0], [7, 3, 5])
tensor(3

 46%|████▋     | 336/725 [11:43:06<16:10:07, 149.63s/it]

test：0.0, test mean: 0.3225806451612903
([2, 1, 2], [2, 1, 2])
tensor(25.3514, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 46%|████▋     | 337/725 [11:43:38<12:19:23, 114.34s/it]

([1, 2, 0], [1, 2, 0])
tensor(17.8140, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 47%|████▋     | 338/725 [11:44:05<9:29:24, 88.28s/it]  

([7, 7, 0], [7, 7, 0])
tensor(12.0536, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (380927, 2.9423112869262695), (499999, 0.0)]
([7, 7, 0], [7, 7, 0])
tensor(9.2734, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 16.432491302490234), (380927, 20.091007232666016), (499999, 0.0)]
([7, 7, 0], [7, 7, 0])
tensor(5.1840, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.019277095794678), (380927, 9.259359359741211), (499999, 0.0)]
([0, 0, 0], [7, 7, 0])
tensor(0.0078, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 4.988507270812988), (380927, 8.434396743774414), (499999, 0.0)]
([0, 0, 0], [7, 7, 0])
tensor(0.0044, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 4.958494186401367), (380927, 7.976251602172852), (499999, 0.0)]
([0, 0, 0], [7, 7, 0])
tensor(0.0037, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 4.928476333618164), (380927, 7.588134765625), (499999, 0.0)]
([0, 0, 0], [7, 7, 0])
tensor(0.0031, device='cuda:0', g

 47%|████▋     | 339/725 [11:48:35<15:18:57, 142.84s/it]

test：0.0, test mean: 0.32
([1, 2, 1], [1, 2, 1])
tensor(19.7832, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 47%|████▋     | 340/725 [11:49:00<11:28:52, 107.36s/it]

([1, 1, 2], [1, 1, 2])
tensor(24.5394, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 47%|████▋     | 341/725 [11:49:26<8:51:01, 82.97s/it]  

([2, 1, 0], [2, 1, 0])
tensor(18.3980, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 47%|████▋     | 342/725 [11:49:53<7:02:28, 66.18s/it]

([2, 0, 8], [2, 0, 8])
tensor(15.1673, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -13.138886451721191)]
([2, 0, 8], [2, 0, 8])
tensor(15.6831, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -42.1973876953125)]
([2, 0, 8], [2, 0, 8])
tensor(15.6783, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -70.00808715820312)]
([2, 0, 8], [2, 0, 8])
tensor(15.0006, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -94.00593566894531)]
([2, 0, 8], [2, 0, 8])
tensor(14.4158, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -120.59185028076172)]
([2, 0, 8], [2, 0, 8])
tensor(14.0169, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -147.64187622070312)]
([2, 0, 8], [2, 0, 8])
tensor(13.5617, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -174

 47%|████▋     | 343/725 [11:54:46<14:14:05, 134.15s/it]

test：0.0, test mean: 0.31746031746031744
([1, 1, 0], [1, 1, 0])
tensor(13.6405, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 47%|████▋     | 344/725 [11:55:14<10:49:45, 102.32s/it]

([7, 5, 2], [7, 5, 2])
tensor(17.7685, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 3.167205572128296), (499999, 0.0), (499999, 0.0)]
([7, 5, 2], [7, 5, 2])
tensor(15.5078, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 38.357452392578125), (499999, 0.0), (499999, 0.0)]
([3, 5, 2], [7, 5, 2])
tensor(14.7221, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 17.264188766479492), (499999, 0.0), (499999, 0.0)]
([0, 5, 2], [7, 5, 2])
tensor(13.3878, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 11.634408950805664), (499999, 0.0), (499999, 0.0)]
([0, 5, 2], [7, 5, 2])
tensor(13.3109, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 11.302247047424316), (499999, 0.0), (499999, 0.0)]
([0, 5, 2], [7, 5, 2])
tensor(13.3105, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 11.00710678100586), (499999, 0.0), (499999, 0.0)]
([0, 5, 2], [7, 5, 2])
tensor(13.3101, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 10.746417999267578), (499999, 0.0), (4

 48%|████▊     | 345/725 [12:00:24<17:22:52, 164.66s/it]

test：0.0, test mean: 0.31496062992125984
([2, 2, 8], [2, 2, 8])
tensor(23.0359, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 48%|████▊     | 346/725 [12:00:52<13:01:03, 123.65s/it]

([2, 2, 1], [2, 2, 1])
tensor(28.5530, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 48%|████▊     | 347/725 [12:01:21<10:00:45, 95.36s/it] 

([0, 6, 6], [0, 6, 6])
tensor(4.6153, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 0.07544729858636856), (499999, 0.0), (499999, 0.0)]
([0, 6, 6], [0, 6, 6])
tensor(4.6066, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 0.0841664969921112), (499999, 0.0), (499999, 0.0)]
([0, 6, 6], [0, 6, 6])
tensor(4.6066, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 0.08416881412267685), (499999, 0.0), (499999, 0.0)]
([0, 6, 6], [0, 6, 6])
tensor(4.6066, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 0.08417113125324249), (499999, 0.0), (499999, 0.0)]
([0, 6, 6], [0, 6, 6])
tensor(4.6066, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 0.08417344093322754), (499999, 0.0), (499999, 0.0)]
([0, 6, 6], [0, 6, 6])
tensor(4.6066, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 0.08417575806379318), (499999, 0.0), (499999, 0.0)]
([0, 6, 6], [0, 6, 6])
tensor(4.6066, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 0.08417806774377823), (499999, 0.0), (

 48%|████▊     | 348/725 [12:06:09<16:02:05, 153.12s/it]

test：1.0, test mean: 0.3203125
([7, 0, 2], [7, 1, 2])
tensor(13.4059, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 1, 2])
tensor(11.2677, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 21.49880599975586), (499999, 0.0), (499999, 0.0)]
([7, 0, 2], [7, 1, 2])
tensor(9.3039, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.203079223632812), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 1, 2])
tensor(7.2598, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.203277587890625), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 1, 2])
tensor(7.2598, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.20347785949707), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 1, 2])
tensor(7.2598, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.203678131103516), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [7, 1, 2])
tensor(7.2598, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.203824996

 48%|████▊     | 349/725 [12:10:43<19:46:36, 189.35s/it]

test：0.0, test mean: 0.3178294573643411
([2, 7, 5], [2, 7, 5])
tensor(14.7660, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9423112869262695), (499999, 0.0)]
([2, 7, 5], [2, 7, 5])
tensor(13.8349, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 20.00885772705078), (499999, 0.0)]
([2, 7, 5], [2, 7, 5])
tensor(11.5735, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 9.781723022460938), (499999, 0.0)]
([2, 0, 5], [2, 7, 5])
tensor(8.9383, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 9.175792694091797), (499999, 0.0)]
([2, 0, 5], [2, 7, 5])
tensor(8.9375, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 8.632865905761719), (499999, 0.0)]
([2, 0, 5], [2, 7, 5])
tensor(8.9360, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 8.199321746826172), (499999, 0.0)]
([2, 0, 5], [2, 7, 5])
tensor(8.9357, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0),

 48%|████▊     | 350/725 [12:15:35<22:55:27, 220.07s/it]

test：0.0, test mean: 0.3153846153846154
([2, 6, 7], [2, 6, 7])
tensor(17.7043, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.7659730911254883)]
([2, 6, 7], [2, 6, 7])
tensor(16.4305, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 24.84676742553711)]
([2, 6, 7], [2, 6, 7])
tensor(12.8839, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 98.8154525756836)]
([2, 6, 0], [2, 6, 7])
tensor(11.8870, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 98.81005859375)]
([2, 6, 0], [2, 6, 7])
tensor(11.8870, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 98.80465698242188)]
([2, 6, 0], [2, 6, 7])
tensor(11.8869, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 98.81195831298828)]
([2, 6, 0], [2, 6, 7])
tensor(11.8869, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0),

 48%|████▊     | 351/725 [12:20:32<25:16:09, 243.23s/it]

test：0.0, test mean: 0.31297709923664124
([1, 2, 5], [1, 2, 5])
tensor(19.7921, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 49%|████▊     | 352/725 [12:20:59<18:28:20, 178.29s/it]

([6, 5, 6], [6, 5, 6])
tensor(9.6008, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.0011975764064118266), (499999, 0.0)]


 49%|████▊     | 353/725 [12:21:24<13:40:35, 132.35s/it]

([3, 0, 5], [3, 0, 5])
tensor(8.8481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 49%|████▉     | 354/725 [12:21:50<10:20:32, 100.36s/it]

([1, 8, 8], [1, 8, 8])
tensor(15.2859, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -13.144383430480957), (499999, 0.0)]
([1, 8, 8], [1, 8, 8])
tensor(15.7571, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -37.25322723388672), (499999, 0.0)]
([1, 8, 8], [1, 8, 8])
tensor(15.7363, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -65.23672485351562), (499999, 0.0)]
([1, 8, 8], [1, 8, 8])
tensor(14.9134, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -90.51957702636719), (499999, 0.0)]
([1, 8, 8], [1, 8, 8])
tensor(13.9720, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -115.71737670898438), (499999, 0.0)]
([1, 1, 8], [1, 8, 8])
tensor(13.3014, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -127.64697265625), (499999, 0.0)]
([1, 8, 8], [1, 8, 8])
tensor(12.6293, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -131.4192657470703), 

 49%|████▉     | 355/725 [12:26:34<15:58:46, 155.48s/it]

test：0.0, test mean: 0.3106060606060606
([8, 5, 0], [8, 5, 0])
tensor(10.6181, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.00019623333355411887), (499999, 0.0)]


 49%|████▉     | 356/725 [12:26:58<11:53:43, 116.05s/it]

([8, 7, 2], [8, 7, 2])
tensor(21.7103, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([8, 7, 2], [8, 7, 2])
tensor(18.4743, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 18.719202041625977), (499999, 0.0)]
([8, 7, 2], [8, 7, 2])
tensor(17.6963, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 15.070271492004395), (499999, 0.0)]
([8, 7, 2], [8, 7, 2])
tensor(14.0702, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -19.210603713989258), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(12.9667, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -19.21381187438965), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(12.9667, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -19.217016220092773), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(12.9667, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -19.22022247314453)

 49%|████▉     | 357/725 [12:31:47<17:10:47, 168.06s/it]

test：0.0, test mean: 0.3082706766917293
([5, 1, 1], [5, 1, 1])
tensor(7.4781, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.000799795612692833), (499999, 0.0), (499999, 0.0)]


 49%|████▉     | 358/725 [12:32:14<12:48:05, 125.57s/it]

([0, 3, 1], [0, 3, 1])
tensor(11.2883, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -17.98503875732422), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(10.6604, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -52.2571907043457), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(10.5699, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -108.90302276611328), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(9.8670, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -139.088623046875), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(8.9473, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -169.94732666015625), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(8.3277, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -200.0457305908203), (499999, 0.0)]
([0, 3, 1], [0, 3, 1])
tensor(7.8660, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -227.68360900878906), (49

 50%|████▉     | 359/725 [12:37:11<18:00:43, 177.17s/it]

test：0.0, test mean: 0.30597014925373134
([1, 1, 2], [1, 1, 2])
tensor(17.8665, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 50%|████▉     | 360/725 [12:37:38<13:23:16, 132.04s/it]

([2, 7, 1], [2, 7, 1])
tensor(17.1205, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 2.942309617996216), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(15.8744, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 32.07612609863281), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(13.0686, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 18.1945743560791), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(12.4134, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 18.194503784179688), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(12.4134, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 18.19443702697754), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(12.4134, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 18.194372177124023), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(12.4134, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 18.19430160522461), (49999

 50%|████▉     | 361/725 [12:42:04<17:25:02, 172.26s/it]

test：0.0, test mean: 0.3037037037037037
([0, 7, 2], [0, 7, 2])
tensor(21.0289, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.00012129350216127932), (380927, 2.9181621074676514), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(19.0738, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.4038134217262268), (380927, 20.368528366088867), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(17.3021, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.4038247764110565), (380927, 22.393360137939453), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(14.6330, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.40382295846939087), (380927, 22.60991668701172), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(14.6323, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.4037238359451294), (380927, 22.7164249420166), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(14.6320, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.4036402702331543), (380927, 22.773937225341797), (499999, 0.0

 50%|████▉     | 362/725 [12:46:26<20:05:19, 199.23s/it]

test：0.5, test mean: 0.30514705882352944
([0, 2, 1], [6, 2, 1])
tensor(18.6867, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -2.1879361156607047e-05), (499999, 0.0), (499999, 0.0)]


 50%|█████     | 363/725 [12:46:53<14:50:01, 147.52s/it]

([5, 5, 3], [5, 5, 3])
tensor(11.8226, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.00681028189137578)]


 50%|█████     | 364/725 [12:47:18<11:05:58, 110.69s/it]

([0, 1, 2], [7, 1, 2])
tensor(10.5588, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 50%|█████     | 365/725 [12:47:43<8:31:14, 85.21s/it]  

([8, 8, 3], [8, 8, 3])
tensor(11.8151, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 50%|█████     | 366/725 [12:48:11<6:45:37, 67.79s/it]

([7, 2, 8], [7, 2, 8])
tensor(14.3599, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 51%|█████     | 367/725 [12:48:36<5:28:21, 55.03s/it]

([7, 7, 5], [7, 7, 5])
tensor(18.3117, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.7659730911254883), (380927, 2.9181621074676514), (499999, 0.0)]
([7, 7, 5], [7, 7, 5])
tensor(14.0153, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 12.643152236938477), (380927, 36.218162536621094), (499999, 0.0)]
([7, 7, 5], [7, 7, 5])
tensor(9.0911, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 120.45036315917969), (380927, 23.071334838867188), (499999, 0.0)]
([7, 0, 5], [7, 7, 5])
tensor(4.5341, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 116.5662841796875), (380927, 23.107421875), (499999, 0.0)]
([0, 0, 5], [7, 7, 5])
tensor(3.8472, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 117.27545166015625), (380927, 23.14350700378418), (499999, 0.0)]
([0, 0, 5], [7, 7, 5])
tensor(3.8354, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 117.53965759277344), (380927, 23.17831039428711), (499999, 0.0)]
([0, 0, 5], [7, 7, 5])
tensor(3.8322, device='cuda:0'

 51%|█████     | 368/725 [12:53:03<11:45:55, 118.64s/it]

test：0.0, test mean: 0.3029197080291971
([2, 2, 2], [2, 2, 2])
tensor(24.9981, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.001180042396299541), (499999, 0.0)]


 51%|█████     | 369/725 [12:53:29<8:59:11, 90.87s/it]  

([0, 8, 6], [0, 8, 6])
tensor(5.3880, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -1.0773047733891872e-06), (499999, 0.0), (499999, 0.0)]
([0, 8, 6], [0, 8, 6])
tensor(5.3880, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, 7.248564202200214e-07), (499999, 0.0), (499999, 0.0)]
([0, 8, 6], [0, 8, 6])
tensor(5.3880, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -4.992340109311044e-06), (499999, 0.0), (499999, 0.0)]
([0, 8, 6], [0, 8, 6])
tensor(5.4389, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -1.7866109609603882), (499999, 0.0), (499999, 0.0)]
([0, 8, 6], [0, 8, 6])
tensor(5.3880, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -1.7866178750991821), (499999, 0.0), (499999, 0.0)]
([0, 8, 6], [0, 8, 6])
tensor(5.3880, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -1.786620020866394), (499999, 0.0), (499999, 0.0)]
([0, 8, 6], [0, 8, 6])
tensor(5.3880, device='cuda:0', grad_fn=<NllLossBackward0>)
[(430079, -1.7866253852844238), (499999

 51%|█████     | 370/725 [12:57:54<14:07:20, 143.21s/it]

test：1.0, test mean: 0.3079710144927536
([3, 1, 2], [3, 1, 2])
tensor(18.0396, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.05233266204595566), (499999, 0.0), (499999, 0.0)]


 51%|█████     | 371/725 [12:58:19<10:35:20, 107.69s/it]

([2, 1, 2], [2, 1, 2])
tensor(30.3491, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 51%|█████▏    | 372/725 [12:58:45<8:08:32, 83.04s/it]  

([2, 7, 2], [2, 7, 2])
tensor(27.7734, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9423112869262695), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(26.0787, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 8.44124698638916), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(23.9754, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -3.422300338745117), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(19.5320, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 35.90736770629883), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(19.1942, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 35.26215362548828), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(19.1851, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 34.914119720458984), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(19.1809, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 34.638214111328125), (499

 51%|█████▏    | 373/725 [13:03:15<13:36:08, 139.12s/it]

test：0.0, test mean: 0.3057553956834532
([2, 1, 8], [2, 1, 8])
tensor(24.4404, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.10918749868869781)]


 52%|█████▏    | 374/725 [13:03:41<10:16:07, 105.32s/it]

([0, 8, 6], [0, 8, 6])
tensor(9.6233, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 52%|█████▏    | 375/725 [13:04:06<7:52:53, 81.07s/it]  

([8, 2, 0], [8, 2, 0])
tensor(17.2850, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -1.3046244021097664e-05)]


 52%|█████▏    | 376/725 [13:04:31<6:14:19, 64.35s/it]

([8, 2, 8], [8, 2, 8])
tensor(16.3063, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 52%|█████▏    | 377/725 [13:04:56<5:04:44, 52.54s/it]

([5, 2, 2], [5, 2, 2])
tensor(19.2763, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 52%|█████▏    | 378/725 [13:05:20<4:13:58, 43.92s/it]

([2, 8, 1], [2, 8, 1])
tensor(18.7573, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 52%|█████▏    | 379/725 [13:05:43<3:37:36, 37.74s/it]

([5, 1, 1], [5, 1, 1])
tensor(17.9727, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 52%|█████▏    | 380/725 [13:06:06<3:11:28, 33.30s/it]

([0, 8, 8], [0, 8, 8])
tensor(8.9093, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.17508664727210999)]


 53%|█████▎    | 381/725 [13:06:29<2:53:55, 30.34s/it]

([1, 1, 0], [1, 1, 0])
tensor(15.5608, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -1.7518399545224383e-05)]
([1, 1, 0], [1, 1, 0])
tensor(15.5608, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -3.515425487421453e-05)]
([1, 1, 0], [1, 1, 0])
tensor(15.5608, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -5.278167736832984e-05)]
([1, 1, 0], [1, 1, 0])
tensor(15.5608, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -4.7499641368631274e-05)]
([1, 1, 0], [1, 1, 0])
tensor(15.5608, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -5.948098987573758e-05)]
([1, 1, 0], [1, 1, 0])
tensor(15.5608, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -8.5242441855371e-05)]
([1, 1, 0], [1, 1, 0])
tensor(15.5608, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (49999

 53%|█████▎    | 382/725 [13:11:13<10:07:21, 106.24s/it]

test：1.0, test mean: 0.3107142857142857
([5, 2, 6], [5, 2, 6])
tensor(15.4054, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.06495456397533417), (499999, 0.0), (499999, 0.0)]


 53%|█████▎    | 383/725 [13:11:37<7:45:12, 81.62s/it]  

([3, 1, 2], [3, 1, 2])
tensor(11.6260, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 2.6093288397532888e-05), (466943, -4.57085657119751), (499999, 0.0)]
([3, 1, 2], [3, 1, 2])
tensor(11.6510, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 5.2186576795065776e-05), (466943, -8.591010093688965), (499999, 0.0)]
([3, 1, 2], [3, 1, 2])
tensor(11.6510, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 7.827987428754568e-05), (466943, -12.634201049804688), (499999, 0.0)]
([3, 1, 2], [3, 1, 2])
tensor(11.6510, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00010437315359013155), (466943, -16.67739486694336), (499999, 0.0)]
([3, 1, 2], [3, 1, 2])
tensor(11.6510, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00013046644744463265), (466943, -20.720584869384766), (499999, 0.0)]
([3, 1, 2], [3, 1, 2])
tensor(11.6510, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00015655973402317613), (466943, -24.763776779174805), (499999, 0.0)]
([3, 1, 2], [3,

 53%|█████▎    | 384/725 [13:16:07<13:04:56, 138.11s/it]

test：1.0, test mean: 0.31560283687943264
([2, 2, 5], [2, 2, 3])
tensor(17.0314, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 53%|█████▎    | 385/725 [13:16:31<9:49:28, 104.03s/it] 

([2, 2, 8], [2, 2, 8])
tensor(25.2771, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 53%|█████▎    | 386/725 [13:16:54<7:30:25, 79.72s/it] 

([7, 1, 8], [7, 1, 8])
tensor(16.6890, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.7659730911254883), (499999, 0.0), (499999, 0.0)]
([7, 1, 8], [7, 1, 8])
tensor(14.8276, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 9.980613708496094), (499999, 0.0), (499999, 0.0)]
([7, 1, 8], [7, 1, 8])
tensor(10.4416, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 50.755455017089844), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(8.6794, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 50.72892761230469), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(8.6793, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 50.70089340209961), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(8.6793, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 50.67285919189453), (499999, 0.0), (499999, 0.0)]
([0, 1, 8], [7, 1, 8])
tensor(8.6793, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 50.6476936340332), (499999, 0.0), (499999, 0

 53%|█████▎    | 387/725 [13:21:24<12:50:15, 136.73s/it]

test：0.0, test mean: 0.31338028169014087
([1, 7, 2], [1, 7, 2])
tensor(18.3276, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.7872986793518066), (499999, 0.0)]
([1, 7, 2], [1, 7, 2])
tensor(18.3285, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.6739608645439148), (499999, 0.0)]
([1, 7, 2], [1, 7, 2])
tensor(15.7405, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -43.05971908569336), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(11.6133, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -43.08464431762695), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(11.6133, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -43.10912322998047), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(11.6133, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -43.1336555480957), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(11.6130, device='cuda:0', grad_fn=<NllLossBackward0>)
[(49

 54%|█████▎    | 388/725 [13:26:22<17:19:38, 185.10s/it]

test：0.0, test mean: 0.3111888111888112
([3, 5, 1], [3, 5, 1])
tensor(12.2516, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 54%|█████▎    | 389/725 [13:26:49<12:50:41, 137.62s/it]

([0, 2, 3], [0, 2, 3])
tensor(14.4705, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (233471, -16.844614028930664)]
([0, 2, 3], [0, 2, 3])
tensor(13.8980, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (233471, -29.837011337280273)]
([0, 2, 3], [0, 2, 3])
tensor(12.4625, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (233471, -38.63356018066406)]
([0, 2, 3], [0, 2, 3])
tensor(11.2079, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (233471, -47.02641296386719)]
([0, 2, 3], [0, 2, 3])
tensor(10.6430, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (233471, -51.90013122558594)]
([0, 2, 0], [0, 2, 3])
tensor(10.3974, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (233471, -49.43671798706055)]
([0, 2, 0], [0, 2, 3])
tensor(10.2175, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (233471, -43.

 54%|█████▍    | 390/725 [13:32:15<18:03:23, 194.04s/it]

test：0.0, test mean: 0.3090277777777778
([1, 1, 8], [1, 1, 8])
tensor(18.4588, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 54%|█████▍    | 391/725 [13:32:42<13:22:31, 144.17s/it]

([8, 7, 2], [8, 7, 2])
tensor(17.0589, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 2.765972852706909), (499999, 0.0)]
([8, 7, 2], [8, 7, 2])
tensor(15.1204, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 44.117088317871094), (499999, 0.0)]
([8, 7, 2], [8, 7, 2])
tensor(12.8278, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 150.97015380859375), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(10.9373, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 150.981201171875), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(10.9373, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 150.99224853515625), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(10.9373, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 151.00332641601562), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(10.9373, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 151.01512145996094), (49

 54%|█████▍    | 392/725 [13:37:31<17:21:08, 187.59s/it]

test：0.0, test mean: 0.30689655172413793
([6, 0, 2], [6, 0, 2])
tensor(13.8209, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 54%|█████▍    | 393/725 [13:37:58<12:50:39, 139.28s/it]

([1, 1, 2], [1, 1, 2])
tensor(21.2603, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 54%|█████▍    | 394/725 [13:38:24<9:41:43, 105.45s/it] 

([1, 2, 1], [1, 2, 1])
tensor(14.7513, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 54%|█████▍    | 395/725 [13:38:50<7:29:05, 81.65s/it] 

([6, 1, 1], [6, 1, 1])
tensor(17.0711, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 55%|█████▍    | 396/725 [13:39:16<5:55:55, 64.91s/it]

([7, 0, 5], [7, 0, 5])
tensor(11.2484, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9423112869262695), (331775, 2.9042257665423676e-07), (499999, 0.0)]
([7, 0, 5], [7, 0, 5])
tensor(9.5830, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 12.900809288024902), (331775, 0.0006162489298731089), (499999, 0.0)]
([7, 0, 5], [7, 0, 5])
tensor(7.6155, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 1.0011277198791504), (331775, 0.0006162773352116346), (499999, 0.0)]
([7, 0, 5], [7, 0, 5])
tensor(4.0331, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -28.04555892944336), (331775, 0.0021411175839602947), (499999, 0.0)]
([0, 0, 5], [7, 0, 5])
tensor(3.4512, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -28.022329330444336), (331775, 0.0021410533227026463), (499999, 0.0)]
([0, 0, 5], [7, 0, 5])
tensor(3.4512, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -27.999101638793945), (331775, 0.002188683021813631), (499999, 0.0)]
([0, 0, 5], [7, 0, 5])
te

 55%|█████▍    | 397/725 [13:43:48<11:34:36, 127.06s/it]

test：0.5, test mean: 0.3082191780821918
([2, 8, 2], [2, 8, 2])
tensor(22.9294, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 55%|█████▍    | 398/725 [13:44:13<8:45:37, 96.45s/it]  

([2, 2, 0], [2, 2, 0])
tensor(14.9917, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -0.043602026998996735)]
([2, 2, 0], [2, 2, 0])
tensor(14.9856, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -0.03476863354444504)]
([2, 2, 0], [2, 2, 0])
tensor(14.9855, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -0.03045511059463024)]
([2, 2, 0], [2, 2, 0])
tensor(14.9855, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -0.025439323857426643)]
([2, 2, 0], [2, 2, 0])
tensor(14.9855, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -0.021852951496839523)]
([2, 2, 0], [2, 2, 0])
tensor(14.9855, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (454655, -0.01912509649991989)]
([2, 2, 0], [2, 2, 0])
tensor(14.9855, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), 

 55%|█████▌    | 399/725 [13:48:46<13:31:40, 149.39s/it]

test：1.0, test mean: 0.3129251700680272
([3, 5, 8], [3, 5, 8])
tensor(15.1086, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 55%|█████▌    | 400/725 [13:49:11<10:06:48, 112.03s/it]

([1, 7, 0], [1, 7, 0])
tensor(7.8501, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, -7.405117034912109), (380927, 2.7659730911254883), (499999, 0.0006051213131286204)]
([1, 7, 0], [1, 7, 0])
tensor(6.6831, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, -26.144968032836914), (380927, 23.384143829345703), (499999, 0.0012102426262572408)]
([1, 7, 0], [1, 7, 0])
tensor(3.4068, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, -44.88481903076172), (380927, 115.48860168457031), (499999, 0.0018153648125007749)]
([1, 0, 0], [1, 7, 0])
tensor(1.9445, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, -63.62466812133789), (380927, 115.4781494140625), (499999, 0.0024204852525144815)]
([1, 0, 0], [1, 7, 0])
tensor(1.9445, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, -82.36451721191406), (380927, 115.47765350341797), (499999, 0.003025607205927372)]
([1, 0, 0], [1, 7, 0])
tensor(1.9445, device='cuda:0', grad_fn=<NllLossBackward0>)
[(344063, -101.1043701171875), (3

 55%|█████▌    | 401/725 [13:53:59<14:49:10, 164.66s/it]

test：0.5, test mean: 0.3141891891891892
([1, 1, 2], [1, 1, 2])
tensor(20.1987, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 55%|█████▌    | 402/725 [13:54:26<11:04:20, 123.41s/it]

([8, 1, 3], [8, 1, 3])
tensor(13.4110, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 56%|█████▌    | 403/725 [13:54:52<8:25:20, 94.16s/it]  

([0, 2, 1], [0, 2, 1])
tensor(15.4775, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 56%|█████▌    | 404/725 [13:55:20<6:37:48, 74.36s/it]

([2, 0, 1], [2, 0, 1])
tensor(18.1363, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 1.1577863006095868e-06), (499999, 0.0)]


 56%|█████▌    | 405/725 [13:55:46<5:20:10, 60.03s/it]

([2, 8, 8], [2, 8, 8])
tensor(24.8187, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0014140980783849955), (499999, 0.0), (499999, 0.0)]


 56%|█████▌    | 406/725 [13:56:13<4:25:03, 49.85s/it]

([0, 7, 2], [0, 7, 2])
tensor(11.4436, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 56%|█████▌    | 407/725 [13:56:39<3:46:25, 42.72s/it]

([5, 6, 8], [5, 6, 8])
tensor(14.9688, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 56%|█████▋    | 408/725 [13:57:04<3:17:28, 37.38s/it]

([3, 1, 7], [3, 1, 7])
tensor(14.0806, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 2.693244695663452)]
([3, 1, 7], [3, 1, 7])
tensor(10.8436, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 5.149259567260742)]
([3, 1, 3], [3, 1, 7])
tensor(8.7329, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.4057068824768066)]
([3, 1, 0], [3, 1, 7])
tensor(7.4676, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.61557674407959)]
([3, 1, 0], [3, 1, 7])
tensor(7.4657, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.7331881523132324)]
([3, 1, 0], [3, 1, 7])
tensor(7.4639, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.7332048416137695)]
([3, 1, 0], [3, 1, 7])
tensor(7.4639, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 3.7332251071929

 56%|█████▋    | 409/725 [14:02:13<10:26:26, 118.95s/it]

test：0.0, test mean: 0.31208053691275167
([5, 7, 8], [5, 7, 8])
tensor(15.2089, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 3.347373058204539e-05), (380927, 2.7659730911254883), (499999, 0.0)]
([5, 7, 8], [5, 7, 8])
tensor(13.8945, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 6.694746116409078e-05), (380927, 24.846778869628906), (499999, 0.0)]
([5, 7, 8], [5, 7, 8])
tensor(10.2442, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00010042123176390305), (380927, 103.47589111328125), (499999, 0.0)]
([5, 0, 8], [5, 7, 8])
tensor(9.1691, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00013389492232818156), (380927, 103.47174072265625), (499999, 0.0)]
([5, 0, 8], [5, 7, 8])
tensor(9.1691, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0001673686201684177), (380927, 103.46759796142578), (499999, 0.0)]
([5, 0, 8], [5, 7, 8])
tensor(9.1691, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00020084237621631473), (380927, 103.4634475708007

 57%|█████▋    | 410/725 [14:07:09<15:02:49, 171.97s/it]

test：0.0, test mean: 0.31
([5, 1, 8], [5, 1, 8])
tensor(16.2318, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 57%|█████▋    | 411/725 [14:07:35<11:12:15, 128.46s/it]

([1, 0, 2], [1, 0, 2])
tensor(13.9968, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 57%|█████▋    | 412/725 [14:08:04<8:33:12, 98.38s/it]  

([3, 2, 3], [3, 2, 3])
tensor(16.8329, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, -17.164306640625)]
([3, 2, 3], [3, 2, 3])
tensor(16.5082, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, -24.329038619995117)]
([3, 2, 0], [3, 2, 3])
tensor(14.8016, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, -26.983137130737305)]
([3, 2, 0], [3, 2, 3])
tensor(14.6909, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, -29.650760650634766)]
([3, 2, 0], [3, 2, 3])
tensor(14.6748, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, -30.371307373046875)]
([3, 2, 0], [3, 2, 3])
tensor(14.6569, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, -30.313589096069336)]
([3, 2, 0], [3, 2, 3])
tensor(14.6553, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (208895, -30

 57%|█████▋    | 413/725 [14:13:38<14:38:54, 169.02s/it]

test：0.0, test mean: 0.3079470198675497
([0, 6, 3], [0, 6, 5])
tensor(6.7251, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 57%|█████▋    | 414/725 [14:14:01<10:49:43, 125.35s/it]

([2, 2, 1], [2, 2, 1])
tensor(20.1764, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 57%|█████▋    | 415/725 [14:14:27<8:14:10, 95.65s/it]  

([1, 8, 1], [1, 8, 1])
tensor(14.2634, device='cuda:0', grad_fn=<NllLossBackward0>)


 57%|█████▋    | 416/725 [14:14:54<6:25:36, 74.88s/it]

[(499999, 0.0), (499999, 0.0), (499999, 0.0)]
([0, 7, 0], [0, 0, 0])
tensor(0.2821, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 58%|█████▊    | 417/725 [14:15:19<5:07:41, 59.94s/it]

([2, 0, 1], [2, 0, 1])
tensor(15.1207, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 58%|█████▊    | 418/725 [14:15:45<4:14:28, 49.73s/it]

([1, 5, 2], [1, 5, 2])
tensor(14.3845, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.005311501212418079), (499999, 0.0), (499999, 0.0)]


 58%|█████▊    | 419/725 [14:16:10<3:36:09, 42.38s/it]

([5, 7, 0], [5, 7, 0])
tensor(5.8931, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 58%|█████▊    | 420/725 [14:16:35<3:09:17, 37.24s/it]

([0, 1, 5], [0, 1, 5])
tensor(9.4300, device='cuda:0', grad_fn=<NllLossBackward0>)


 58%|█████▊    | 421/725 [14:17:01<2:50:53, 33.73s/it]

[(499999, -5.959033842373174e-06), (499999, 0.0), (499999, 0.0)]
([7, 2, 7], [7, 2, 7])
tensor(21.3847, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (499999, 0.0)]
([7, 2, 7], [7, 2, 7])
tensor(18.2807, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 17.62541389465332), (499999, 0.0), (499999, 0.0)]
([7, 2, 7], [7, 2, 7])
tensor(17.4714, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 16.440204620361328), (499999, 0.0), (499999, 0.0)]
([7, 2, 7], [7, 2, 7])
tensor(13.8212, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 12.229330062866211), (499999, 0.0), (499999, 0.0)]
([0, 2, 7], [7, 2, 7])
tensor(12.8178, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 12.21417236328125), (499999, 0.0), (499999, 0.0)]
([0, 2, 7], [7, 2, 7])
tensor(12.8178, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 12.199012756347656), (499999, 0.0), (499999, 0.0)]
([0, 2, 7], [7, 2, 7])
tensor(12.8178, device='cuda:0', grad_fn=<Nl

 58%|█████▊    | 422/725 [14:21:33<8:51:23, 105.23s/it]

test：0.0, test mean: 0.3059210526315789
([2, 1, 2], [2, 1, 2])
tensor(27.0036, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 58%|█████▊    | 423/725 [14:22:02<6:54:49, 82.42s/it] 

([3, 2, 2], [3, 2, 2])
tensor(24.7038, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -17.981460571289062), (499999, 0.0), (499999, 0.0)]
([3, 2, 2], [3, 2, 2])
tensor(24.2080, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -52.251502990722656), (499999, 0.0), (499999, 0.0)]
([3, 2, 2], [3, 2, 2])
tensor(24.1175, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -108.88200378417969), (499999, 0.0), (499999, 0.0)]
([3, 2, 2], [3, 2, 2])
tensor(23.4157, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -138.9053497314453), (499999, 0.0), (499999, 0.0)]
([3, 2, 2], [3, 2, 2])
tensor(22.4939, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -155.9625701904297), (499999, 0.0), (499999, 0.0)]
([1, 2, 2], [3, 2, 2])
tensor(21.5820, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -186.06166076660156), (499999, 0.0), (499999, 0.0)]
([1, 2, 2], [3, 2, 2])
tensor(21.2944, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -212.0826416015625), (499999, 0.

 58%|█████▊    | 424/725 [14:26:33<11:37:43, 139.08s/it]

test：0.0, test mean: 0.30392156862745096
([1, 5, 8], [1, 5, 8])
tensor(8.2187, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 59%|█████▊    | 425/725 [14:27:00<8:46:19, 105.27s/it] 

([7, 8, 1], [7, 8, 1])
tensor(20.5161, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9369421005249023), (499999, 0.0), (499999, 0.0)]
([7, 8, 1], [7, 8, 1])
tensor(18.6532, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 15.471473693847656), (499999, 0.0), (499999, 0.0)]
([7, 8, 1], [7, 8, 1])
tensor(16.5404, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.814328193664551), (499999, 0.0), (499999, 0.0)]
([0, 8, 1], [7, 8, 1])
tensor(14.5792, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.842276573181152), (499999, 0.0), (499999, 0.0)]
([0, 8, 1], [7, 8, 1])
tensor(14.5792, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.870227336883545), (499999, 0.0), (499999, 0.0)]
([0, 8, 1], [7, 8, 1])
tensor(14.5792, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.898176670074463), (499999, 0.0), (499999, 0.0)]
([0, 8, 1], [7, 8, 1])
tensor(14.5792, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.926129341125488), (499999, 0.0), 

 59%|█████▉    | 426/725 [14:31:23<12:40:29, 152.61s/it]

test：0.0, test mean: 0.30194805194805197
([1, 1, 2], [1, 1, 2])
tensor(19.0613, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.005520861595869064), (499999, 0.0)]


 59%|█████▉    | 427/725 [14:31:49<9:29:18, 114.62s/it] 

([0, 0, 7], [0, 0, 7])
tensor(2.6212, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 59%|█████▉    | 428/725 [14:32:14<7:14:42, 87.82s/it] 

([3, 1, 6], [3, 1, 6])
tensor(11.2781, device='cuda:0', grad_fn=<NllLossBackward0>)
[(275966, 5.61631965637207), (499999, 0.0), (499999, 0.0)]
([1, 1, 6], [3, 1, 6])
tensor(10.8847, device='cuda:0', grad_fn=<NllLossBackward0>)
[(275966, 13.681492805480957), (499999, 0.0), (499999, 0.0)]
([1, 1, 6], [3, 1, 6])
tensor(11.2602, device='cuda:0', grad_fn=<NllLossBackward0>)
[(275966, 22.135841369628906), (499999, 0.0), (499999, 0.0)]
([1, 1, 6], [3, 1, 6])
tensor(10.3081, device='cuda:0', grad_fn=<NllLossBackward0>)
[(275966, 22.54298210144043), (499999, 0.0), (499999, 0.0)]
([0, 1, 6], [3, 1, 6])
tensor(10.1143, device='cuda:0', grad_fn=<NllLossBackward0>)
[(275966, 31.802473068237305), (499999, 0.0), (499999, 0.0)]
([0, 1, 6], [3, 1, 6])
tensor(9.9838, device='cuda:0', grad_fn=<NllLossBackward0>)
[(275966, 32.515869140625), (499999, 0.0), (499999, 0.0)]
([0, 1, 6], [3, 1, 6])
tensor(9.9823, device='cuda:0', grad_fn=<NllLossBackward0>)
[(275966, 33.022789001464844), (499999, 0.0), (499999,

 59%|█████▉    | 429/725 [14:36:40<11:36:46, 141.24s/it]

test：0.0, test mean: 0.3
([7, 2, 1], [7, 2, 1])
tensor(29.1430, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.7659730911254883), (499999, 0.0), (499999, 0.0)]
([7, 2, 1], [7, 2, 1])
tensor(26.0775, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 16.66072654724121), (499999, 0.0), (499999, 0.0)]
([7, 2, 1], [7, 2, 1])
tensor(23.2954, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 49.78679656982422), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(20.3585, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 49.774444580078125), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(20.3585, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 49.7620849609375), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(20.3585, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 49.75232696533203), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [7, 2, 1])
tensor(20.3585, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 49.7425613403320

 59%|█████▉    | 430/725 [14:41:08<14:42:05, 179.41s/it]

test：0.0, test mean: 0.2980769230769231
([5, 1, 7], [5, 1, 7])
tensor(14.2751, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 59%|█████▉    | 431/725 [14:41:34<10:52:40, 133.20s/it]

([0, 8, 2], [0, 8, 2])
tensor(12.0627, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 60%|█████▉    | 432/725 [14:41:59<8:12:22, 100.83s/it] 

([2, 6, 0], [2, 6, 7])
tensor(11.9483, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 60%|█████▉    | 433/725 [14:42:24<6:19:49, 78.05s/it] 

([0, 5, 1], [0, 5, 1])
tensor(11.2566, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 60%|█████▉    | 434/725 [14:42:50<5:02:32, 62.38s/it]

([1, 2, 2], [1, 2, 2])
tensor(23.5526, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 60%|██████    | 435/725 [14:43:16<4:08:42, 51.46s/it]

([2, 1, 2], [2, 1, 2])
tensor(24.7449, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 60%|██████    | 436/725 [14:43:41<3:30:04, 43.62s/it]

([5, 0, 7], [5, 0, 7])
tensor(10.9962, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 1.3095508620608598e-05), (380927, 2.9181621074676514)]
([5, 0, 7], [5, 0, 7])
tensor(8.4986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.00024109004880301654), (380927, 14.155105590820312)]
([5, 0, 7], [5, 0, 7])
tensor(6.9502, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.00024283910170197487), (380927, 9.747808456420898)]
([5, 0, 0], [5, 0, 7])
tensor(3.9063, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.0002421056997263804), (380927, 9.769806861877441)]
([5, 0, 0], [5, 0, 7])
tensor(3.9062, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.00020065723219886422), (380927, 9.782953262329102)]
([5, 0, 0], [5, 0, 7])
tensor(3.9062, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.00020072855113539845), (380927, 9.796099662780762)]
([5, 0, 0], [5, 0, 7])
ten

 60%|██████    | 437/725 [14:48:39<9:35:21, 119.87s/it]

test：0.5, test mean: 0.29936305732484075
([2, 7, 7], [2, 7, 7])
tensor(18.2165, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([2, 7, 7], [2, 7, 7])
tensor(14.7711, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 18.71919822692871), (499999, 0.0)]
([2, 7, 7], [2, 7, 7])
tensor(14.2416, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 16.40452766418457), (499999, 0.0)]
([2, 7, 7], [2, 7, 7])
tensor(10.6003, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.733366012573242), (499999, 0.0)]
([2, 0, 7], [2, 7, 7])
tensor(9.5206, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.726385593414307), (499999, 0.0)]
([2, 0, 7], [2, 7, 7])
tensor(9.5206, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 4.719413757324219), (499999, 0.0)]
([2, 0, 7], [2, 7, 7])
tensor(9.5206, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0

 60%|██████    | 438/725 [14:53:12<13:12:44, 165.73s/it]

test：0.0, test mean: 0.2974683544303797
([1, 5, 1], [1, 5, 1])
tensor(12.8385, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 61%|██████    | 439/725 [14:53:37<9:48:57, 123.56s/it] 

([1, 2, 0], [1, 2, 0])
tensor(17.8611, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 61%|██████    | 440/725 [14:54:02<7:27:12, 94.15s/it] 

([0, 5, 0], [0, 5, 0])
tensor(3.8230, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 61%|██████    | 441/725 [14:54:29<5:49:54, 73.92s/it]

([2, 2, 2], [2, 2, 2])
tensor(34.4821, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 61%|██████    | 442/725 [14:54:54<4:39:11, 59.19s/it]

([0, 7, 2], [1, 7, 2])
tensor(13.9070, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 61%|██████    | 443/725 [14:55:18<3:49:12, 48.77s/it]

([2, 6, 0], [2, 6, 1])
tensor(10.8094, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 61%|██████    | 444/725 [14:55:43<3:15:15, 41.69s/it]

([1, 1, 1], [1, 1, 1])
tensor(11.6052, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 61%|██████▏   | 445/725 [14:56:09<2:51:30, 36.75s/it]

([2, 0, 7], [2, 0, 7])
tensor(15.8686, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.9181621074676514)]
([2, 0, 7], [2, 0, 7])
tensor(12.4868, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 17.590667724609375)]
([2, 0, 7], [2, 0, 7])
tensor(11.9036, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 14.870207786560059)]
([2, 0, 7], [2, 0, 7])
tensor(8.2241, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -7.435666084289551)]
([2, 0, 0], [2, 0, 7])
tensor(6.9999, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -7.4379143714904785)]
([2, 0, 0], [2, 0, 7])
tensor(6.9999, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -7.440166473388672)]
([2, 0, 0], [2, 0, 7])
tensor(6.9999, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -7.442413

 62%|██████▏   | 446/725 [15:01:00<8:46:28, 113.22s/it]

test：0.0, test mean: 0.29559748427672955
([8, 8, 2], [8, 8, 2])
tensor(21.0294, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 62%|██████▏   | 447/725 [15:01:25<6:41:38, 86.69s/it] 

([7, 1, 5], [7, 1, 5])
tensor(10.0206, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 62%|██████▏   | 448/725 [15:01:50<5:14:51, 68.20s/it]

([6, 1, 2], [6, 1, 2])
tensor(18.0804, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 62%|██████▏   | 449/725 [15:02:15<4:13:37, 55.14s/it]

([1, 2, 1], [1, 2, 1])
tensor(25.7740, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.009891888126730919), (499999, 0.0)]


 62%|██████▏   | 450/725 [15:02:40<3:32:14, 46.31s/it]

([1, 7, 6], [1, 7, 6])
tensor(16.9877, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -0.5570147037506104), (499999, 0.0)]
([1, 7, 6], [1, 7, 6])
tensor(16.8994, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -2.174717903137207), (499999, 0.0)]
([1, 7, 6], [1, 7, 6])
tensor(11.1850, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.5780081748962402), (499999, 0.0)]
([1, 7, 6], [1, 7, 6])
tensor(10.8426, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -13.015602111816406), (499999, 0.0)]
([1, 3, 6], [1, 7, 6])
tensor(10.0999, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 129.9805908203125), (499999, 0.0)]
([1, 0, 6], [1, 7, 6])
tensor(9.0453, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 129.8026885986328), (499999, 0.0)]
([1, 0, 6], [1, 7, 6])
tensor(9.0312, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 129.749755859375), (499

 62%|██████▏   | 451/725 [15:07:24<8:56:20, 117.45s/it]

test：0.0, test mean: 0.29375
([2, 2, 0], [2, 2, 0])
tensor(20.7036, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 62%|██████▏   | 452/725 [15:07:50<6:49:30, 90.00s/it] 

([8, 2, 3], [8, 2, 3])
tensor(15.5060, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 62%|██████▏   | 453/725 [15:08:16<5:21:32, 70.93s/it]

([2, 3, 2], [2, 3, 2])
tensor(20.2937, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.03468162193894386), (499999, 0.0)]


 63%|██████▎   | 454/725 [15:08:43<4:20:05, 57.59s/it]

([1, 7, 2], [1, 7, 2])
tensor(19.4770, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 63%|██████▎   | 455/725 [15:09:09<3:37:02, 48.23s/it]

([5, 0, 2], [5, 0, 2])
tensor(15.3539, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 63%|██████▎   | 456/725 [15:09:35<3:06:12, 41.53s/it]

([0, 2, 3], [0, 2, 3])
tensor(15.7671, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, -3.1065639632288367e-05), (499999, 0.0), (208895, -17.191287994384766)]
([0, 2, 3], [0, 2, 3])
tensor(15.4148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, -5.924570723436773e-05), (499999, 0.0), (208895, -33.318687438964844)]
([0, 2, 3], [0, 2, 3])
tensor(14.1261, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, -8.742577483644709e-05), (499999, 0.0), (208895, -48.23539733886719)]
([0, 2, 3], [0, 2, 3])
tensor(12.8047, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, -0.00011560584243852645), (499999, 0.0), (208895, -45.152252197265625)]
([0, 2, 0], [0, 2, 3])
tensor(12.2653, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, -0.0001437859027646482), (499999, 0.0), (208895, -48.37701416015625)]
([0, 2, 0], [0, 2, 3])
tensor(12.1804, device='cuda:0', grad_fn=<NllLossBackward0>)
[(393215, -0.00017196597764268517), (499999, 0.0), (208895, -48.202293395996094)]
([0, 2, 0

 63%|██████▎   | 457/725 [15:15:10<9:37:59, 129.40s/it]

test：0.5, test mean: 0.2950310559006211
([0, 7, 2], [0, 7, 2])
tensor(16.2646, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 2.9181528091430664), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(12.4937, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 51.08605194091797), (499999, 0.0)]
([0, 7, 2], [0, 7, 2])
tensor(10.3098, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 32.92583084106445), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(9.5490, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 32.92584228515625), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(9.5490, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 32.92585754394531), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(9.5490, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 32.925865173339844), (499999, 0.0)]
([0, 0, 2], [0, 7, 2])
tensor(9.5490, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0)

 63%|██████▎   | 458/725 [15:19:42<12:46:17, 172.20s/it]

test：0.0, test mean: 0.2932098765432099
([0, 0, 8], [0, 0, 8])
tensor(5.5702, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 1.0291244507243391e-05), (499999, 0.0)]


 63%|██████▎   | 459/725 [15:20:06<9:26:52, 127.87s/it] 

([2, 1, 1], [2, 1, 1])
tensor(20.3874, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -7.4389448165893555), (499999, 0.0)]
([2, 1, 1], [2, 1, 1])
tensor(20.4209, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -13.656715393066406), (499999, 0.0)]
([2, 1, 1], [2, 1, 1])
tensor(20.4209, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -19.87448501586914), (499999, 0.0)]
([2, 1, 1], [2, 1, 1])
tensor(20.4209, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -26.09225845336914), (499999, 0.0)]
([2, 1, 1], [2, 1, 1])
tensor(20.4209, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -32.310028076171875), (499999, 0.0)]
([2, 1, 1], [2, 1, 1])
tensor(20.4209, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -38.52779769897461), (499999, 0.0)]
([2, 1, 1], [2, 1, 1])
tensor(20.4209, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -44.74556732177734

 63%|██████▎   | 460/725 [15:24:41<12:40:05, 172.09s/it]

test：1.0, test mean: 0.29754601226993865
([2, 7, 0], [2, 7, 0])
tensor(15.6402, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -1.1880149641818605e-10)]


 64%|██████▎   | 461/725 [15:25:07<9:24:12, 128.23s/it] 

([6, 1, 3], [6, 1, 3])
tensor(18.7998, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 64%|██████▎   | 462/725 [15:25:33<7:07:35, 97.55s/it] 

([1, 2, 0], [1, 2, 0])
tensor(19.4034, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 64%|██████▍   | 463/725 [15:25:58<5:30:16, 75.63s/it]

([8, 7, 2], [8, 7, 2])
tensor(23.9772, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([8, 7, 2], [8, 7, 2])
tensor(20.5875, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 12.86850357055664), (499999, 0.0)]
([8, 7, 2], [8, 7, 2])
tensor(19.5344, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 0.45883655548095703), (499999, 0.0)]
([8, 7, 2], [8, 7, 2])
tensor(15.3245, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.575748920440674), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(15.0122, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.5629820823669434), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(15.0122, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.5502147674560547), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(15.0122, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.5374507904052734), (

 64%|██████▍   | 464/725 [15:30:49<10:10:35, 140.37s/it]

test：0.0, test mean: 0.29573170731707316
([8, 1, 1], [8, 1, 1])
tensor(17.6324, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 64%|██████▍   | 465/725 [15:31:16<7:40:42, 106.32s/it] 

([8, 2, 7], [8, 2, 7])
tensor(13.8421, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.0058426205068826675), (499999, 0.0), (499999, 0.0)]


 64%|██████▍   | 466/725 [15:31:38<5:50:05, 81.10s/it] 

([0, 8, 2], [0, 8, 2])
tensor(18.1173, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 64%|██████▍   | 467/725 [15:32:02<4:34:51, 63.92s/it]

([0, 1, 2], [0, 1, 2])
tensor(16.4406, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 65%|██████▍   | 468/725 [15:32:30<3:47:55, 53.21s/it]

([2, 1, 1], [2, 1, 1])
tensor(23.8809, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.005318725947290659), (499999, 0.024088073521852493)]


 65%|██████▍   | 469/725 [15:32:56<3:11:50, 44.96s/it]

([0, 6, 7], [0, 6, 7])
tensor(11.6259, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.030018553137779236), (499999, 0.0), (380927, 2.7659730911254883)]
([0, 6, 7], [0, 6, 7])
tensor(9.4952, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.03434496745467186), (499999, 0.0), (380927, 10.125085830688477)]
([0, 6, 7], [0, 6, 7])
tensor(5.4566, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.034349337220191956), (499999, 0.0), (380927, 49.629539489746094)]
([0, 6, 0], [0, 6, 7])
tensor(3.2837, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.03435371071100235), (499999, 0.0), (380927, 49.5538330078125)]
([0, 6, 0], [0, 6, 7])
tensor(3.2831, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.034358080476522446), (499999, 0.0), (380927, 49.50535583496094)]
([0, 6, 0], [0, 6, 7])
tensor(3.2829, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.03436245024204254), (499999, 0.0), (380927, 49.4520378112793)]
([0, 6, 0], [0, 6, 7])
tensor(3.2829, dev

 65%|██████▍   | 470/725 [15:37:47<8:24:20, 118.67s/it]

test：0.5, test mean: 0.296969696969697
([7, 2, 0], [7, 2, 0])
tensor(17.9485, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9423112869262695), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(16.1490, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 12.900809288024902), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(14.2808, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.4924321174621582), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(10.4449, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -32.71236038208008), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(9.7751, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -32.643524169921875), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(9.7741, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -32.68640899658203), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(9.7741, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927,

 65%|██████▍   | 471/725 [15:42:09<11:25:18, 161.88s/it]

test：0.0, test mean: 0.29518072289156627
([7, 2, 0], [7, 2, 0])
tensor(17.3975, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.942309617996216), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(15.1069, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 14.431379318237305), (499999, 0.0), (499999, 0.0)]
([7, 2, 0], [7, 2, 0])
tensor(12.5986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -8.865626335144043), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(10.8075, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -8.863463401794434), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(10.8075, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -8.861299514770508), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(10.8075, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -8.859139442443848), (499999, 0.0), (499999, 0.0)]
([0, 2, 0], [7, 2, 0])
tensor(10.8075, device='cuda:0', grad_fn=<NllLossBackward0>)
[(3686

 65%|██████▌   | 472/725 [15:46:31<13:29:29, 191.97s/it]

test：0.0, test mean: 0.2934131736526946
([6, 8, 1], [6, 8, 1])
tensor(17.4426, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 65%|██████▌   | 473/725 [15:46:58<9:57:58, 142.37s/it] 

([2, 2, 1], [2, 2, 1])
tensor(22.9233, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 65%|██████▌   | 474/725 [15:47:23<7:28:40, 107.25s/it]

([0, 1, 2], [0, 1, 2])
tensor(15.5888, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 66%|██████▌   | 475/725 [15:47:49<5:44:36, 82.71s/it] 

([2, 8, 2], [2, 8, 2])
tensor(25.0526, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 66%|██████▌   | 476/725 [15:48:14<4:31:19, 65.38s/it]

([2, 2, 8], [2, 2, 1])
tensor(16.9492, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 66%|██████▌   | 477/725 [15:48:37<3:37:52, 52.71s/it]

([1, 2, 6], [1, 2, 6])
tensor(19.6601, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 66%|██████▌   | 478/725 [15:49:01<3:01:49, 44.17s/it]

([7, 8, 2], [7, 8, 2])
tensor(14.6183, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -11.817911148071289), (499999, 0.0)]
([7, 8, 2], [7, 8, 2])
tensor(15.0297, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -37.45065689086914), (499999, 0.0)]
([7, 8, 2], [7, 8, 2])
tensor(15.0297, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -63.083709716796875), (499999, 0.0)]
([7, 8, 2], [7, 8, 2])
tensor(15.0297, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -88.71676635742188), (499999, 0.0)]
([7, 8, 2], [7, 8, 2])
tensor(15.0297, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -114.34981536865234), (499999, 0.0)]
([7, 8, 2], [7, 8, 2])
tensor(15.0297, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -139.98287963867188), (499999, 0.0)]
([7, 8, 2], [7, 8, 2])
tensor(15.0297, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -165.615921020507

 66%|██████▌   | 479/725 [15:53:41<7:51:21, 114.97s/it]

test：1.0, test mean: 0.2976190476190476
([2, 1, 1], [2, 1, 1])
tensor(21.5153, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 66%|██████▌   | 480/725 [15:54:04<5:55:45, 87.12s/it] 

([2, 0, 3], [2, 0, 3])
tensor(13.5678, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 3.080901660723612e-05), (499999, 0.0)]


 66%|██████▋   | 481/725 [15:54:28<4:37:34, 68.26s/it]

([0, 7, 8], [0, 7, 8])
tensor(11.2810, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([0, 7, 8], [0, 7, 8])
tensor(9.3720, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 15.880879402160645), (499999, 0.0)]
([0, 7, 8], [0, 7, 8])
tensor(7.2081, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 14.969651222229004), (499999, 0.0)]
([0, 0, 8], [0, 7, 8])
tensor(5.1075, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 14.969324111938477), (499999, 0.0)]
([0, 0, 8], [0, 7, 8])
tensor(5.1075, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 14.969000816345215), (499999, 0.0)]
([0, 0, 8], [0, 7, 8])
tensor(5.1075, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 14.968673706054688), (499999, 0.0)]
([0, 0, 8], [0, 7, 8])
tensor(5.1075, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 14.96834945678711), (499999

 66%|██████▋   | 482/725 [15:58:59<8:43:32, 129.27s/it]

test：0.0, test mean: 0.2958579881656805
([0, 2, 2], [0, 2, 2])
tensor(18.1348, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 1.8777092236632598e-07), (499999, 0.0), (499999, -0.009563562460243702)]


 67%|██████▋   | 483/725 [15:59:24<6:34:29, 97.81s/it] 

([5, 2, 5], [5, 2, 5])
tensor(17.3623, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 67%|██████▋   | 484/725 [15:59:49<5:05:02, 75.94s/it]

([5, 0, 0], [5, 0, 0])
tensor(4.1308, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -1.3688677427126095e-05)]
([5, 0, 0], [5, 0, 0])
tensor(4.1308, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, 0.0001445141388103366)]
([5, 0, 0], [5, 0, 0])
tensor(4.1308, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, 0.00014227398787625134)]
([5, 0, 0], [5, 0, 0])
tensor(4.1308, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, 0.00013912495342083275)]
([5, 0, 0], [5, 0, 0])
tensor(4.1308, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, 0.00013751770893577486)]
([5, 0, 0], [5, 0, 0])
tensor(4.1308, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, 0.00013602469698525965)]
([5, 0, 0], [5, 0, 0])
tensor(4.1308, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0)

 67%|██████▋   | 485/725 [16:04:31<9:11:26, 137.86s/it]

test：1.0, test mean: 0.3
([6, 0, 0], [6, 0, 0])
tensor(3.5726, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 67%|██████▋   | 486/725 [16:04:56<6:54:06, 103.96s/it]

([0, 0, 6], [0, 0, 6])
tensor(2.4812, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 67%|██████▋   | 487/725 [16:05:22<5:19:20, 80.51s/it] 

([7, 8, 8], [7, 8, 8])
tensor(22.9973, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (499999, 0.0)]
([7, 8, 8], [7, 8, 8])
tensor(20.8499, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 20.368518829345703), (499999, 0.0), (499999, 0.0)]
([7, 8, 8], [7, 8, 8])
tensor(19.1738, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 19.216123580932617), (499999, 0.0), (499999, 0.0)]
([0, 8, 8], [7, 8, 8])
tensor(16.5169, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 18.91876220703125), (499999, 0.0), (499999, 0.0)]
([0, 8, 8], [7, 8, 8])
tensor(16.5165, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 18.679607391357422), (499999, 0.0), (499999, 0.0)]
([0, 8, 8], [7, 8, 8])
tensor(16.5164, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 18.455951690673828), (499999, 0.0), (499999, 0.0)]
([0, 8, 8], [7, 8, 8])
tensor(16.5163, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 18.24911117553711), (499999, 0.0), (4

 67%|██████▋   | 488/725 [16:09:59<9:11:12, 139.55s/it]

test：0.0, test mean: 0.2982456140350877
([1, 1, 2], [1, 1, 2])
tensor(20.9281, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 67%|██████▋   | 489/725 [16:10:24<6:54:12, 105.31s/it]

([2, 2, 1], [2, 2, 1])
tensor(24.9420, device='cuda:0', grad_fn=<NllLossBackward0>)


 68%|██████▊   | 490/725 [16:10:51<5:20:10, 81.75s/it] 

[(499999, 0.0), (499999, 0.0), (499999, 0.00536485156044364)]
([1, 0, 1], [1, 0, 1])
tensor(12.7907, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -0.00036231096601113677), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(12.7909, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.06191345676779747), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(12.7905, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.061913155019283295), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(12.7905, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.06192617118358612), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(12.7905, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.06192602589726448), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(12.7905, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, 0.06192588433623314), (499999, 0.0)]
([1, 0, 1], [1, 0, 1])
tensor(12.7905, device='cuda:0', g

 68%|██████▊   | 491/725 [16:15:40<9:20:54, 143.82s/it]

test：1.0, test mean: 0.3023255813953488
([2, 2, 6], [2, 2, 6])
tensor(28.2179, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 68%|██████▊   | 492/725 [16:16:05<7:00:37, 108.32s/it]

([0, 2, 7], [7, 2, 7])
tensor(18.9191, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -0.733353853225708)]
([0, 2, 7], [7, 2, 7])
tensor(18.9210, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 7.207763195037842)]
([0, 2, 7], [7, 2, 7])
tensor(17.3651, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 20.903865814208984)]
([0, 2, 8], [7, 2, 7])
tensor(18.2577, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -5.976827621459961)]
([0, 2, 7], [7, 2, 7])
tensor(13.4951, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -44.090614318847656)]
([0, 2, 0], [7, 2, 7])
tensor(12.5402, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -44.09061813354492)]
([0, 2, 0], [7, 2, 7])
tensor(12.5402, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -44.09

 68%|██████▊   | 493/725 [16:21:08<10:44:45, 166.75s/it]

test：0.0, test mean: 0.30057803468208094
([1, 8, 0], [1, 8, 0])
tensor(10.7005, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 1.8118444131687284e-05)]
([1, 8, 0], [1, 8, 0])
tensor(10.7005, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.4474756072741e-05)]
([1, 8, 0], [1, 8, 0])
tensor(10.7005, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.447477790061384e-05)]
([1, 8, 0], [1, 8, 0])
tensor(10.7005, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.44748070044443e-05)]
([1, 8, 0], [1, 8, 0])
tensor(10.7005, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.447483610827476e-05)]
([1, 8, 0], [1, 8, 0])
tensor(10.7005, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, -6.44748579361476e-05)]
([1, 8, 0], [1, 8, 0])
tensor(10.7005, device='cuda:0', grad_fn=<NllL

 68%|██████▊   | 494/725 [16:26:01<13:07:14, 204.48s/it]

test：1.0, test mean: 0.3045977011494253
([2, 0, 1], [2, 0, 1])
tensor(19.5135, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 68%|██████▊   | 495/725 [16:26:26<9:37:13, 150.58s/it] 

([2, 0, 0], [2, 7, 0])
tensor(11.1042, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -3.3897023854478903e-07)]


 68%|██████▊   | 496/725 [16:26:51<7:10:46, 112.87s/it]

([1, 3, 7], [1, 3, 7])
tensor(12.3628, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (244514, 3.3441262245178223), (499999, 0.0)]
([1, 1, 7], [1, 3, 7])
tensor(11.4560, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (244514, 12.80059814453125), (499999, 0.0)]
([1, 0, 7], [1, 3, 7])
tensor(10.6276, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (244514, 12.800603866577148), (499999, 0.0)]
([1, 0, 7], [1, 3, 7])
tensor(10.6276, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (244514, 12.800646781921387), (499999, 0.0)]
([1, 0, 7], [1, 3, 7])
tensor(10.6276, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (244514, 12.800646781921387), (499999, 0.0)]
([1, 0, 7], [1, 3, 7])
tensor(10.6276, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (244514, 12.80064582824707), (499999, 0.0)]
([1, 0, 7], [1, 3, 7])
tensor(10.6276, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (244514, 12.800658226013184), (4

 69%|██████▊   | 497/725 [16:31:30<10:19:05, 162.92s/it]

test：0.0, test mean: 0.3028571428571429
([2, 3, 1], [2, 7, 1])
tensor(15.4465, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.033223845064640045)]


 69%|██████▊   | 498/725 [16:31:56<7:40:04, 121.61s/it] 

([8, 7, 5], [8, 7, 5])
tensor(15.9536, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9423112869262695), (499999, 0.0)]
([8, 7, 5], [8, 7, 5])
tensor(14.1670, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 8.378218650817871), (499999, 0.0)]
([8, 7, 5], [8, 7, 5])
tensor(12.0513, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 1.9585695266723633), (499999, 0.0)]
([8, 0, 5], [8, 7, 5])
tensor(7.7593, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.986594200134277), (499999, 0.0)]
([8, 0, 5], [8, 7, 5])
tensor(7.6839, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.989049434661865), (499999, 0.0)]
([8, 0, 5], [8, 7, 5])
tensor(7.6839, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.95027494430542), (499999, 0.0)]
([8, 0, 5], [8, 7, 5])
tensor(7.6836, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.951297760009766), (499999, 0

 69%|██████▉   | 499/725 [16:36:29<10:29:17, 167.07s/it]

test：0.0, test mean: 0.30113636363636365
([2, 2, 1], [2, 2, 1])
tensor(24.2115, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.021160071715712547)]


 69%|██████▉   | 500/725 [16:36:53<7:46:23, 124.37s/it] 

([2, 7, 5], [2, 7, 1])
tensor(14.4270, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 69%|██████▉   | 501/725 [16:37:19<5:53:15, 94.62s/it] 

([0, 2, 0], [0, 2, 0])
tensor(7.4682, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 69%|██████▉   | 502/725 [16:37:44<4:34:53, 73.96s/it]

([2, 6, 7], [2, 6, 7])
tensor(13.3728, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 69%|██████▉   | 503/725 [16:38:09<3:39:21, 59.28s/it]

([2, 1, 0], [2, 1, 0])
tensor(15.3246, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 70%|██████▉   | 504/725 [16:38:34<3:00:29, 49.00s/it]

([5, 8, 7], [5, 8, 7])
tensor(11.8919, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -13.117998123168945), (380927, 2.7659735679626465)]
([5, 8, 7], [5, 8, 7])
tensor(10.0250, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -40.950096130371094), (380927, 43.33906173706055)]
([5, 8, 7], [5, 8, 7])
tensor(7.4603, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -70.10079956054688), (380927, 121.92668914794922)]
([5, 8, 0], [5, 8, 7])
tensor(6.4345, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -96.91999816894531), (380927, 121.9267807006836)]
([5, 8, 0], [5, 8, 7])
tensor(5.6671, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -135.34873962402344), (380927, 121.9268798828125)]
([5, 8, 0], [5, 8, 7])
tensor(4.8561, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, -164.45248413085938), (380927, 121.92696380615234)]
([5, 8, 0], [5, 8, 7])
tensor(4.5353, devic

 70%|██████▉   | 505/725 [16:43:26<7:25:57, 121.62s/it]

test：0.0, test mean: 0.2994350282485876
([5, 8, 0], [5, 8, 0])
tensor(9.6750, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -2.224463742095395e-06)]
([5, 8, 0], [5, 8, 0])
tensor(9.6750, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 0.0040441942401230335)]
([5, 8, 0], [5, 8, 0])
tensor(9.6750, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 0.004050913266837597)]
([5, 8, 0], [5, 8, 0])
tensor(9.6750, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 0.00404908275231719)]
([5, 8, 0], [5, 8, 0])
tensor(9.6750, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 0.004048258997499943)]
([5, 8, 0], [5, 8, 0])
tensor(9.6750, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, 0.004047399386763573)]
([5, 8, 0], [5, 8, 0])
tensor(9.6750, device='cuda:0', grad_fn=<NllLossBackward0>)

 70%|██████▉   | 506/725 [16:48:02<10:14:03, 168.23s/it]

test：1.0, test mean: 0.30337078651685395
([7, 0, 1], [7, 0, 1])
tensor(9.0806, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.8715732097625732), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(7.4716, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 18.11704444885254), (499999, 0.0), (499999, 0.0)]
([3, 0, 1], [7, 0, 1])
tensor(9.6050, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -17.99230194091797), (499999, 0.0), (499999, 0.0)]
([7, 0, 1], [7, 0, 1])
tensor(8.0674, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -47.249000549316406), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(7.3999, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -46.548423767089844), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(7.3949, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -46.3891487121582), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(7.3947, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4

 70%|██████▉   | 507/725 [16:52:30<11:59:37, 198.06s/it]

test：0.0, test mean: 0.3016759776536313
([0, 2, 7], [0, 2, 7])
tensor(20.4753, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, -6.921525709913112e-06), (499999, -0.0014140980783849955), (380927, 2.9423112869262695)]
([0, 2, 7], [0, 2, 7])
tensor(18.9029, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 0.0011937974486500025), (499999, -0.002828196156769991), (380927, 12.900809288024902)]
([0, 2, 7], [0, 2, 7])
tensor(16.9541, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 0.001198368612676859), (499999, -0.004242294002324343), (380927, -0.4706144332885742)]
([0, 2, 7], [0, 2, 7])
tensor(13.1717, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 0.0011984480079263449), (499999, -0.005656392313539982), (380927, -30.964982986450195)]
([0, 2, 0], [0, 2, 7])
tensor(12.6091, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 0.0011943059507757425), (499999, -0.007070490159094334), (380927, -30.988428115844727)]
([0, 2, 0], [0, 2, 7])
tensor(12.6088, device='cuda

 70%|███████   | 508/725 [16:57:16<13:31:30, 224.38s/it]

test：0.5, test mean: 0.30277777777777776
([1, 0, 2], [1, 0, 2])
tensor(13.4087, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.008770739659667015)]


 70%|███████   | 509/725 [16:57:41<9:52:16, 164.52s/it] 

([2, 5, 0], [2, 2, 0])
tensor(12.6584, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, -0.0022238362580537796)]
([2, 5, 0], [2, 2, 0])
tensor(12.6635, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, 0.7566589117050171)]
([2, 5, 0], [2, 2, 0])
tensor(12.6572, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, 0.7558379173278809)]
([2, 5, 0], [2, 2, 0])
tensor(12.6572, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, 0.7558085918426514)]
([2, 5, 0], [2, 2, 0])
tensor(12.6572, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, 0.7551464438438416)]
([2, 5, 0], [2, 2, 0])
tensor(12.6572, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, 0.7545159459114075)]
([2, 5, 0], [2, 2, 0])
tensor(12.6572, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, 0.

 70%|███████   | 510/725 [17:02:33<12:07:19, 202.97s/it]

test：1.0, test mean: 0.30662983425414364
([2, 8, 2], [2, 8, 2])
tensor(27.0116, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 70%|███████   | 511/725 [17:02:58<8:52:29, 149.30s/it] 

([1, 2, 2], [1, 2, 2])
tensor(21.2255, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 71%|███████   | 512/725 [17:03:22<6:37:03, 111.85s/it]

([2, 3, 0], [2, 3, 0])
tensor(10.0104, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -17.14579963684082), (499999, 0.0)]
([2, 3, 0], [2, 3, 0])
tensor(9.8609, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -28.424776077270508), (499999, 0.0)]
([2, 3, 0], [2, 3, 0])
tensor(8.0932, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -23.249940872192383), (499999, 0.0)]
([2, 0, 0], [2, 3, 0])
tensor(7.3713, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -27.4600830078125), (499999, 0.0)]
([2, 0, 0], [2, 3, 0])
tensor(7.3159, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -27.35862159729004), (499999, 0.0)]
([2, 0, 0], [2, 3, 0])
tensor(7.3161, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -27.429405212402344), (499999, 0.0)]
([2, 0, 0], [2, 3, 0])
tensor(7.3155, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (393215, -27.491697311401367), (49

 71%|███████   | 513/725 [17:08:02<9:33:18, 162.26s/it]

test：0.0, test mean: 0.30494505494505497
([1, 5, 1], [1, 5, 1])
tensor(10.6601, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 71%|███████   | 514/725 [17:08:26<7:05:06, 120.88s/it]

([2, 2, 2], [2, 2, 2])
tensor(20.1707, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.001180045772343874), (499999, 0.0), (499999, 0.0)]


 71%|███████   | 515/725 [17:08:50<5:21:24, 91.83s/it] 

([1, 7, 6], [1, 7, 6])
tensor(20.4834, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([1, 7, 6], [1, 7, 6])
tensor(15.9008, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 19.33700942993164), (499999, 0.0)]
([1, 7, 6], [1, 7, 6])
tensor(15.9169, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 11.32965087890625), (499999, 0.0)]
([1, 0, 6], [1, 7, 6])
tensor(11.6277, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 15.401861190795898), (499999, 0.0)]
([1, 0, 6], [1, 7, 6])
tensor(11.5180, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 15.40202522277832), (499999, 0.0)]
([1, 0, 6], [1, 7, 6])
tensor(11.5180, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 15.402186393737793), (499999, 0.0)]
([1, 0, 6], [1, 7, 6])
tensor(11.5180, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 15.402345657348633), (49

 71%|███████   | 516/725 [17:13:28<8:34:05, 147.58s/it]

test：0.0, test mean: 0.30327868852459017
([2, 8, 7], [2, 8, 7])
tensor(13.4885, device='cuda:0', grad_fn=<NllLossBackward0>)


 71%|███████▏  | 517/725 [17:13:51<6:22:34, 110.36s/it]

[(499999, 0.0), (499999, 0.0), (499999, 0.0)]
([1, 2, 0], [1, 2, 0])
tensor(14.7199, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 71%|███████▏  | 518/725 [17:14:16<4:52:11, 84.69s/it] 

([1, 2, 1], [1, 2, 8])
tensor(21.1014, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 72%|███████▏  | 519/725 [17:14:41<3:49:24, 66.82s/it]

([7, 1, 2], [7, 1, 2])
tensor(27.1749, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (499999, 0.0)]
([7, 1, 2], [7, 1, 2])
tensor(23.9939, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 12.833767890930176), (499999, 0.0), (499999, 0.0)]
([7, 1, 2], [7, 1, 2])
tensor(22.7896, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.3614997863769531), (499999, 0.0), (499999, 0.0)]
([7, 1, 2], [7, 1, 2])
tensor(18.7239, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 8.020853996276855), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(18.4648, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 7.998472690582275), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(18.4648, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 7.9760870933532715), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(18.4648, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 7.953706741333008), (499999, 0.0), (49

 72%|███████▏  | 520/725 [17:19:10<7:15:10, 127.37s/it]

test：0.0, test mean: 0.3016304347826087
([0, 7, 8], [0, 7, 8])
tensor(11.4169, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 0.0030600540339946747), (380927, 2.9369421005249023), (499999, 0.0)]
([0, 7, 8], [0, 7, 8])
tensor(9.6800, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 0.036511942744255066), (380927, 19.524059295654297), (499999, 0.0)]
([0, 7, 8], [0, 7, 8])
tensor(7.6557, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 0.03483608365058899), (380927, 9.846150398254395), (499999, 0.0)]
([0, 0, 8], [0, 7, 8])
tensor(5.7597, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 0.034804314374923706), (380927, 9.719038963317871), (499999, 0.0)]
([0, 0, 8], [0, 7, 8])
tensor(5.7596, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 0.03477254882454872), (380927, 9.60334300994873), (499999, 0.0)]
([0, 0, 8], [0, 7, 8])
tensor(5.7594, device='cuda:0', grad_fn=<NllLossBackward0>)
[(479231, 0.03474077582359314), (380927, 9.512325286865234), (499999, 0.0)]


 72%|███████▏  | 521/725 [17:23:40<9:39:00, 170.29s/it]

test：0.5, test mean: 0.3027027027027027
([5, 1, 1], [5, 1, 1])
tensor(16.7250, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 72%|███████▏  | 522/725 [17:24:07<7:10:23, 127.21s/it]

([7, 7, 2], [7, 7, 2])
tensor(25.7969, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.918153762817383), (380927, 2.9181621074676514), (499999, 0.0)]
([7, 7, 2], [7, 7, 2])
tensor(19.6978, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 51.04530715942383), (380927, 17.31088638305664), (499999, 0.0)]
([7, 7, 2], [7, 7, 2])
tensor(15.8061, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 34.296321868896484), (380927, 0.4389495849609375), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(12.0447, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 34.296409606933594), (380927, -1.4575715065002441), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(12.0373, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 34.29650115966797), (380927, -2.509270191192627), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(12.0320, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 34.29658126831055), (380927, -3.0315518379211426), (499999, 0.0)]
([0, 0, 2], [7, 7, 2])
tensor(12.0301, devi

 72%|███████▏  | 523/725 [17:28:33<9:28:05, 168.74s/it]

test：0.0, test mean: 0.3010752688172043
([1, 2, 7], [1, 2, 7])
tensor(24.3575, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.7659730911254883)]
([1, 2, 7], [1, 2, 7])
tensor(22.5559, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 8.494735717773438)]
([1, 2, 7], [1, 2, 7])
tensor(17.6776, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 55.72071838378906)]
([1, 2, 0], [1, 2, 7])
tensor(16.3094, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 55.57640075683594)]
([1, 2, 0], [1, 2, 7])
tensor(16.3091, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 55.436981201171875)]
([1, 2, 0], [1, 2, 7])
tensor(16.3088, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 55.3131217956543)]
([1, 2, 0], [1, 2, 7])
tensor(16.3082, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0

 72%|███████▏  | 524/725 [17:33:17<11:21:23, 203.40s/it]

test：0.0, test mean: 0.2994652406417112
([1, 2, 7], [1, 2, 7])
tensor(19.3538, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 72%|███████▏  | 525/725 [17:33:41<8:18:45, 149.63s/it] 

([1, 0, 8], [1, 1, 8])
tensor(13.8493, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.002396487630903721), (499999, 0.0)]


 73%|███████▎  | 526/725 [17:34:05<6:10:35, 111.73s/it]

([2, 2, 3], [2, 2, 3])
tensor(26.1375, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -17.18981170654297)]
([2, 2, 3], [2, 2, 3])
tensor(25.8044, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -28.51227378845215)]
([2, 2, 3], [2, 2, 3])
tensor(24.1937, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -23.921411514282227)]
([2, 2, 3], [2, 2, 3])
tensor(22.6694, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -21.51107406616211)]
([2, 2, 0], [2, 2, 3])
tensor(21.8049, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -16.55907440185547)]
([2, 2, 0], [2, 2, 3])
tensor(21.7600, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -16.55658721923828)]
([2, 2, 0], [2, 2, 3])
tensor(21.7594, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -16.5

 73%|███████▎  | 527/725 [17:38:43<8:53:36, 161.70s/it]

test：0.0, test mean: 0.2978723404255319
([1, 2, 2], [1, 2, 2])
tensor(31.4947, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 73%|███████▎  | 528/725 [17:39:08<6:36:19, 120.71s/it]

([1, 5, 2], [1, 5, 2])
tensor(21.0170, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 73%|███████▎  | 529/725 [17:39:32<4:59:32, 91.70s/it] 

([5, 2, 0], [5, 2, 0])
tensor(14.2117, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 73%|███████▎  | 530/725 [17:39:55<3:51:03, 71.09s/it]

([1, 1, 2], [1, 1, 2])
tensor(24.5402, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 73%|███████▎  | 531/725 [17:40:20<3:04:50, 57.17s/it]

([2, 8, 1], [2, 8, 1])
tensor(19.5102, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 73%|███████▎  | 532/725 [17:40:45<2:32:52, 47.53s/it]

([2, 1, 3], [2, 1, 3])
tensor(24.2511, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -17.19186782836914)]
([2, 1, 3], [2, 1, 3])
tensor(23.8546, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -28.004234313964844)]
([2, 1, 3], [2, 1, 3])
tensor(22.0505, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -35.43011474609375)]
([2, 1, 3], [2, 1, 3])
tensor(20.5792, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -41.030670166015625)]
([2, 1, 3], [2, 1, 3])
tensor(19.9121, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -47.34236526489258)]
([2, 1, 3], [2, 1, 3])
tensor(19.3756, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -55.304691314697266)]
([2, 1, 0], [2, 1, 3])
tensor(19.1770, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (466943, -47

 74%|███████▎  | 533/725 [17:45:12<6:03:25, 113.57s/it]

test：0.0, test mean: 0.2962962962962963
([3, 2, 1], [3, 2, 1])
tensor(16.2432, device='cuda:0', grad_fn=<NllLossBackward0>)
[(208895, -16.971399307250977), (499999, 0.0), (499999, -0.0006260767113417387)]
([3, 2, 1], [3, 2, 1])
tensor(15.9579, device='cuda:0', grad_fn=<NllLossBackward0>)
[(208895, -27.459793090820312), (499999, 0.0), (499999, -0.0012521534226834774)]
([0, 2, 1], [3, 2, 1])
tensor(14.8226, device='cuda:0', grad_fn=<NllLossBackward0>)
[(208895, -28.296180725097656), (499999, 0.0), (499999, -0.001878230250440538)]
([0, 2, 1], [3, 2, 1])
tensor(14.7972, device='cuda:0', grad_fn=<NllLossBackward0>)
[(208895, -28.648757934570312), (499999, 0.0), (499999, -0.002504306845366955)]
([0, 2, 1], [3, 2, 1])
tensor(14.7943, device='cuda:0', grad_fn=<NllLossBackward0>)
[(208895, -29.513423919677734), (499999, 0.0), (499999, -0.003130383789539337)]
([0, 2, 1], [3, 2, 1])
tensor(14.7906, device='cuda:0', grad_fn=<NllLossBackward0>)
[(208895, -29.770322799682617), (499999, 0.0), (499999

 74%|███████▎  | 534/725 [17:49:35<8:23:46, 158.25s/it]

test：0.0, test mean: 0.29473684210526313
([2, 0, 6], [2, 0, 6])
tensor(10.9533, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 74%|███████▍  | 535/725 [17:50:00<6:14:34, 118.28s/it]

([0, 1, 8], [0, 1, 8])
tensor(12.9485, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 74%|███████▍  | 536/725 [17:50:26<4:45:11, 90.54s/it] 

([1, 8, 0], [1, 8, 7])
tensor(5.9939, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.020981768146157265), (499999, 0.0)]


 74%|███████▍  | 537/725 [17:50:51<3:42:35, 71.04s/it]

([2, 2, 0], [2, 2, 1])
tensor(19.1780, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 74%|███████▍  | 538/725 [17:51:20<3:01:30, 58.24s/it]

([0, 0, 1], [7, 0, 1])
tensor(4.4794, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -1.0613115364321857e-06), (499999, 0.0)]


 74%|███████▍  | 539/725 [17:51:46<2:30:35, 48.58s/it]

([2, 0, 1], [2, 0, 1])
tensor(15.0420, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 74%|███████▍  | 540/725 [17:52:14<2:10:43, 42.40s/it]

([2, 0, 1], [2, 0, 1])
tensor(15.3138, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -0.06427352875471115), (499999, 0.0)]
([2, 0, 1], [2, 0, 1])
tensor(15.3254, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 1.0262566804885864), (499999, 0.0)]
([2, 3, 1], [2, 0, 1])
tensor(17.1684, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 30.555341720581055), (499999, 0.0)]
([2, 3, 1], [2, 0, 1])
tensor(16.6924, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -15.529073715209961), (499999, 0.0)]
([2, 7, 1], [2, 0, 1])
tensor(17.3201, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 26.065750122070312), (499999, 0.0)]
([2, 1, 1], [2, 0, 1])
tensor(16.0134, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 94.01158905029297), (499999, 0.0)]
([2, 0, 1], [2, 0, 1])
tensor(15.2960, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 95.01158142089844),

 75%|███████▍  | 541/725 [17:57:03<5:57:33, 116.59s/it]

test：1.0, test mean: 0.29842931937172773
([8, 5, 0], [8, 5, 0])
tensor(4.2226, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0017633613897487521), (499999, 0.0)]


 75%|███████▍  | 542/725 [17:57:29<4:32:33, 89.36s/it] 

([1, 8, 5], [1, 8, 5])
tensor(15.5269, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 75%|███████▍  | 543/725 [17:57:57<3:34:42, 70.78s/it]

([8, 7, 2], [8, 1, 2])
tensor(16.5816, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 75%|███████▌  | 544/725 [17:58:21<2:51:44, 56.93s/it]

([8, 2, 0], [8, 2, 0])
tensor(11.5344, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 75%|███████▌  | 545/725 [17:58:47<2:22:33, 47.52s/it]

([3, 1, 2], [3, 1, 2])
tensor(26.0203, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -17.985275268554688), (499999, 0.0), (499999, 0.0)]
([3, 1, 2], [3, 1, 2])
tensor(25.3775, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -34.299705505371094), (499999, 0.0), (499999, 0.0)]
([3, 1, 2], [3, 1, 2])
tensor(24.0052, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -46.897762298583984), (499999, 0.0), (499999, 0.0)]
([3, 1, 2], [3, 1, 2])
tensor(22.8904, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -57.71553039550781), (499999, 0.0), (499999, 0.0)]
([3, 1, 2], [3, 1, 2])
tensor(22.1123, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -63.715763092041016), (499999, 0.0), (499999, 0.0)]
([3, 1, 2], [3, 1, 2])
tensor(21.2977, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -49.67995834350586), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [3, 1, 2])
tensor(20.8657, device='cuda:0', grad_fn=<NllLossBackward0>)
[(466943, -48.208778381347656), (499999, 0

 75%|███████▌  | 546/725 [18:03:08<5:32:47, 111.55s/it]

test：0.0, test mean: 0.296875
([7, 2, 0], [7, 2, 0])
tensor(15.5109, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 75%|███████▌  | 547/725 [18:03:32<4:13:44, 85.53s/it] 

([8, 2, 1], [5, 2, 1])
tensor(17.3192, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0037551410496234894), (499999, 0.0), (499999, 0.0)]


 76%|███████▌  | 548/725 [18:03:57<3:18:23, 67.25s/it]

([8, 2, 3], [8, 2, 3])
tensor(15.7734, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 76%|███████▌  | 549/725 [18:04:23<2:40:40, 54.77s/it]

([8, 7, 2], [8, 7, 2])
tensor(23.7037, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.712027072906494), (499999, 0.0)]
([8, 7, 2], [8, 7, 2])
tensor(22.1950, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 29.58999252319336), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(17.3485, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 29.746870040893555), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(17.3483, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 29.857189178466797), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(17.3481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 29.933902740478516), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(17.3481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 30.010616302490234), (499999, 0.0)]
([8, 0, 2], [8, 7, 2])
tensor(17.3481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 30.07920265197754), (49

 76%|███████▌  | 550/725 [18:08:53<5:48:44, 119.57s/it]

test：0.0, test mean: 0.29533678756476683
([1, 8, 2], [1, 8, 2])
tensor(22.2345, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 76%|███████▌  | 551/725 [18:09:19<4:24:31, 91.22s/it] 

([2, 5, 0], [2, 5, 0])
tensor(16.1646, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, -0.00011470117169665173)]
([2, 5, 0], [2, 5, 0])
tensor(16.1646, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, -0.00023767205129843205)]
([2, 5, 0], [2, 5, 0])
tensor(16.1646, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, -0.00036691961577162147)]
([2, 5, 0], [2, 5, 0])
tensor(16.1646, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, -0.0004961672239005566)]
([2, 5, 0], [2, 5, 0])
tensor(16.1646, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, -0.0006254147738218307)]
([2, 5, 0], [2, 5, 0])
tensor(16.1646, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (417791, -0.0007546623819507658)]
([2, 5, 0], [2, 5, 0])
tensor(16.1646, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (49

 76%|███████▌  | 552/725 [18:14:05<7:12:00, 149.83s/it]

test：1.0, test mean: 0.29896907216494845
([7, 2, 1], [7, 2, 1])
tensor(16.6194, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 76%|███████▋  | 553/725 [18:14:31<5:22:41, 112.57s/it]

([0, 0, 7], [7, 0, 0])
tensor(1.4986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.0)]
([0, 0, 7], [7, 0, 0])
tensor(1.4986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.0)]
([0, 0, 7], [7, 0, 0])
tensor(1.4986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.0)]
([0, 0, 7], [7, 0, 0])
tensor(1.4986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.0)]
([0, 0, 7], [7, 0, 0])
tensor(1.4986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.0)]
([0, 0, 7], [7, 0, 0])
tensor(1.4986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.0)]
([0, 0, 7], [7, 0, 0])
tensor(1.4986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (430079, 0.0)]
([0, 0, 7], [7, 0, 0])
tensor(1.4986, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0)

 76%|███████▋  | 554/725 [18:19:12<7:44:58, 163.15s/it]

test：0.0, test mean: 0.29743589743589743
([2, 2, 2], [2, 2, 2])
tensor(28.3043, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 77%|███████▋  | 555/725 [18:19:38<5:45:57, 122.11s/it]

([7, 1, 1], [7, 1, 1])
tensor(24.6057, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.7659730911254883), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [7, 1, 1])
tensor(22.6955, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 10.278846740722656), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [7, 1, 1])
tensor(18.4872, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 56.69413375854492), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(16.7387, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 56.6322021484375), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(16.7387, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 56.57536315917969), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(16.7386, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 56.521759033203125), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(16.7385, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 56.46694564819336), (499999, 0.0), (4999

 77%|███████▋  | 556/725 [18:24:24<8:01:56, 171.10s/it]

test：0.0, test mean: 0.29591836734693877
([1, 5, 0], [1, 2, 0])
tensor(9.3485, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -2.9760180041193962e-05)]
([1, 5, 0], [1, 2, 0])
tensor(9.3485, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, 3.6923149309586734e-05)]
([1, 5, 0], [1, 2, 0])
tensor(9.3487, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -0.003366508986800909)]
([1, 5, 0], [1, 2, 0])
tensor(9.3485, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -0.00333597999997437)]
([1, 5, 0], [1, 2, 0])
tensor(9.3485, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -0.0033073178492486477)]
([1, 5, 0], [1, 2, 0])
tensor(9.3485, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (442367, -0.003281537676230073)]
([1, 5, 0], [1, 2, 0])
tensor(9.3485, device='cuda:0', grad_fn=<NllLossBac

 77%|███████▋  | 557/725 [18:29:13<9:38:26, 206.59s/it]

test：1.0, test mean: 0.29949238578680204
([1, 8, 1], [1, 8, 1])
tensor(21.9562, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 77%|███████▋  | 558/725 [18:29:39<7:04:24, 152.48s/it]

([7, 7, 8], [7, 7, 8])
tensor(19.3812, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.9181559085845947), (380927, 2.7659730911254883), (499999, 0.0)]
([7, 7, 8], [7, 7, 8])
tensor(12.7664, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 51.49346160888672), (380927, 22.097217559814453), (499999, 0.0)]
([7, 7, 8], [7, 7, 8])
tensor(7.7970, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 55.539939880371094), (380927, 14.348654747009277), (499999, 0.0)]
([0, 0, 8], [7, 7, 8])
tensor(6.1921, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 56.08805847167969), (380927, 14.348651885986328), (499999, 0.0)]
([0, 0, 8], [7, 7, 8])
tensor(6.1899, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 56.185585021972656), (380927, 14.348652839660645), (499999, 0.0)]
([0, 0, 8], [7, 7, 8])
tensor(6.1896, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 56.22801971435547), (380927, 14.348651885986328), (499999, 0.0)]
([0, 0, 8], [7, 7, 8])
tensor(6.1896, device='c

 77%|███████▋  | 559/725 [18:34:12<8:41:22, 188.45s/it]

test：0.0, test mean: 0.29797979797979796
([7, 7, 0], [7, 7, 0])
tensor(5.8067, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 77%|███████▋  | 560/725 [18:34:37<6:23:25, 139.43s/it]

([1, 8, 2], [1, 0, 2])
tensor(17.2848, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 77%|███████▋  | 561/725 [18:35:01<4:47:02, 105.01s/it]

([0, 2, 2], [0, 2, 2])
tensor(15.3481, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 78%|███████▊  | 562/725 [18:35:26<3:39:31, 80.80s/it] 

([8, 5, 3], [8, 5, 3])
tensor(10.7185, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.002769689541310072)]


 78%|███████▊  | 563/725 [18:35:50<2:52:36, 63.93s/it]

([8, 2, 3], [5, 2, 3])
tensor(16.2442, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 78%|███████▊  | 564/725 [18:36:16<2:20:43, 52.44s/it]

([2, 0, 7], [2, 0, 7])
tensor(13.8527, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.888364791870117)]
([2, 0, 7], [2, 0, 7])
tensor(12.5622, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 20.35379409790039)]
([2, 0, 7], [2, 0, 7])
tensor(8.9554, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 136.2168731689453)]
([2, 0, 0], [2, 0, 7])
tensor(7.5452, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 136.42202758789062)]
([2, 0, 0], [2, 0, 7])
tensor(7.5278, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 136.38323974609375)]
([2, 0, 0], [2, 0, 7])
tensor(7.5260, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 136.35946655273438)]
([2, 0, 0], [2, 0, 7])
tensor(7.5227, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 136.3500061035

 78%|███████▊  | 565/725 [18:41:11<5:33:48, 125.18s/it]

test：0.0, test mean: 0.2964824120603015
([2, 2, 3], [2, 2, 3])
tensor(25.3081, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 78%|███████▊  | 566/725 [18:41:36<4:12:31, 95.29s/it] 

([2, 1, 1], [2, 1, 1])
tensor(23.4057, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 78%|███████▊  | 567/725 [18:42:00<3:14:33, 73.88s/it]

([8, 1, 7], [8, 1, 7])
tensor(12.9359, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 78%|███████▊  | 568/725 [18:42:24<2:34:08, 58.91s/it]

([1, 7, 7], [1, 7, 7])
tensor(16.9480, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.7659730911254883), (499999, 0.0)]
([1, 7, 7], [1, 7, 7])
tensor(13.8351, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 23.689777374267578), (499999, 0.0)]
([1, 7, 7], [1, 7, 7])
tensor(11.5540, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 84.82192993164062), (499999, 0.0)]
([1, 0, 7], [1, 7, 7])
tensor(8.5810, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 84.66059112548828), (499999, 0.0)]
([1, 0, 7], [1, 7, 7])
tensor(8.5789, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 84.4793701171875), (499999, 0.0)]
([1, 0, 7], [1, 7, 7])
tensor(8.5779, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 84.34574890136719), (499999, 0.0)]
([1, 0, 7], [1, 7, 7])
tensor(8.5779, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 84.21402740478516), (499999, 0

 78%|███████▊  | 569/725 [18:47:00<5:21:57, 123.83s/it]

test：0.0, test mean: 0.295
([1, 2, 1], [1, 2, 1])
tensor(10.3881, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 79%|███████▊  | 570/725 [18:47:25<4:03:33, 94.28s/it] 

([2, 6, 8], [2, 6, 8])
tensor(20.8781, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 79%|███████▉  | 571/725 [18:47:50<3:08:39, 73.50s/it]

([2, 1, 1], [2, 1, 1])
tensor(23.4372, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.005405490752309561)]


 79%|███████▉  | 572/725 [18:48:14<2:29:57, 58.81s/it]

([8, 1, 5], [8, 1, 5])
tensor(13.6995, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 79%|███████▉  | 573/725 [18:48:40<2:03:40, 48.82s/it]

([1, 2, 2], [1, 2, 2])
tensor(30.5761, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 79%|███████▉  | 574/725 [18:49:02<1:42:47, 40.84s/it]

([0, 7, 0], [0, 7, 0])
tensor(6.2222, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -0.5811673998832703), (499999, 0.0)]
([0, 7, 0], [0, 7, 0])
tensor(4.9185, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 53.007720947265625), (499999, 0.0)]
([0, 0, 0], [0, 7, 0])
tensor(0.0192, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 57.520721435546875), (499999, 0.0)]
([0, 0, 0], [0, 7, 0])
tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 57.52135467529297), (499999, 0.0)]
([0, 0, 0], [0, 7, 0])
tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 57.52198791503906), (499999, 0.0)]
([0, 0, 0], [0, 7, 0])
tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 57.52262496948242), (499999, 0.0)]
([0, 0, 0], [0, 7, 0])
tensor(0.0020, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 57.523258209228516), (499999, 

 79%|███████▉  | 575/725 [18:53:52<4:49:06, 115.64s/it]

test：0.0, test mean: 0.2935323383084577
([7, 1, 2], [7, 1, 2])
tensor(19.2336, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.7659730911254883), (499999, 0.0), (499999, 0.0)]
([7, 1, 2], [7, 1, 2])
tensor(17.3464, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 10.10091781616211), (499999, 0.0), (499999, 0.0)]
([7, 1, 2], [7, 1, 2])
tensor(13.1858, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 56.623626708984375), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(11.1733, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 56.517974853515625), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(11.1732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 56.41278076171875), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(11.1731, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 56.312259674072266), (499999, 0.0), (499999, 0.0)]
([0, 1, 2], [7, 1, 2])
tensor(11.1727, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927

 79%|███████▉  | 576/725 [18:58:39<6:54:45, 167.02s/it]

test：0.0, test mean: 0.29207920792079206
([2, 0, 5], [2, 0, 2])
tensor(15.0276, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 80%|███████▉  | 577/725 [18:59:07<5:08:44, 125.17s/it]

([1, 0, 0], [1, 0, 0])
tensor(8.9558, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 80%|███████▉  | 578/725 [18:59:33<3:53:56, 95.49s/it] 

([3, 2, 1], [3, 2, 1])
tensor(16.0100, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 80%|███████▉  | 579/725 [18:59:58<3:00:51, 74.33s/it]

([2, 5, 8], [2, 5, 8])
tensor(15.4669, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 80%|████████  | 580/725 [19:00:22<2:22:57, 59.16s/it]

([1, 2, 0], [1, 2, 0])
tensor(17.6306, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 80%|████████  | 581/725 [19:00:47<1:57:16, 48.86s/it]

([8, 2, 8], [8, 2, 8])
tensor(14.2029, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 80%|████████  | 582/725 [19:01:13<1:40:04, 41.99s/it]

([2, 7, 2], [2, 7, 2])
tensor(25.6635, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 80%|████████  | 583/725 [19:01:40<1:28:41, 37.48s/it]

([1, 2, 0], [1, 2, 0])
tensor(15.5935, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -1.946892552950885e-05)]


 81%|████████  | 584/725 [19:02:07<1:20:51, 34.40s/it]

([2, 1, 0], [2, 1, 0])
tensor(15.9436, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -6.806040175888484e-08)]
([2, 1, 0], [2, 1, 0])
tensor(15.9436, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -1.1879475891873881e-07)]
([2, 1, 0], [2, 1, 0])
tensor(15.9436, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -1.2173556740435743e-07)]
([2, 1, 0], [2, 1, 0])
tensor(15.9436, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -1.2474536958961835e-07)]
([2, 1, 0], [2, 1, 0])
tensor(15.9436, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -1.276384296033939e-07)]
([2, 1, 0], [2, 1, 0])
tensor(15.9436, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, -1.3052085989784246e-07)]
([2, 1, 0], [2, 1, 0])
tensor(15.9436, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (4

 81%|████████  | 585/725 [19:06:58<4:19:55, 111.39s/it]

test：1.0, test mean: 0.2955665024630542
([2, 2, 2], [2, 2, 2])
tensor(33.4869, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 81%|████████  | 586/725 [19:07:23<3:17:52, 85.42s/it] 

([5, 1, 7], [5, 1, 7])
tensor(11.5753, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.024820121005177498), (499999, 0.0), (499999, 0.0)]


 81%|████████  | 587/725 [19:07:47<2:34:20, 67.10s/it]

([5, 0, 8], [5, 4, 8])
tensor(8.4476, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 81%|████████  | 588/725 [19:08:12<2:04:12, 54.40s/it]

([1, 7, 1], [1, 7, 1])
tensor(13.1551, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 81%|████████  | 589/725 [19:08:37<1:43:16, 45.56s/it]

([2, 8, 2], [2, 8, 2])
tensor(30.5866, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 81%|████████▏ | 590/725 [19:09:02<1:29:11, 39.64s/it]

([1, 6, 0], [1, 6, 0])
tensor(12.2303, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 82%|████████▏ | 591/725 [19:09:26<1:18:03, 34.95s/it]

([5, 0, 2], [2, 0, 2])
tensor(10.4127, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.005727349780499935), (466943, 0.0017641325248405337), (499999, 0.0)]
([5, 0, 2], [2, 0, 2])
tensor(10.4124, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.01145469956099987), (466943, 4.083511885255575e-05), (499999, 0.0)]
([5, 0, 2], [2, 0, 2])
tensor(10.4123, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.017182040959596634), (466943, -0.001087931334041059), (499999, 0.0)]
([5, 0, 2], [2, 0, 2])
tensor(10.4123, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.02290939912199974), (466943, -0.0011006623972207308), (499999, 0.0)]
([5, 0, 2], [2, 0, 2])
tensor(10.4123, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.028636738657951355), (466943, -0.001117766834795475), (499999, 0.0)]
([5, 0, 2], [2, 0, 2])
tensor(10.4123, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.03436408191919327), (466943, -0.0011340935016050935), (499999, 0.0)]
([5, 0, 2],

 82%|████████▏ | 592/725 [19:13:59<3:55:19, 106.16s/it]

test：1.0, test mean: 0.29901960784313725
([5, 0, 8], [5, 0, 8])
tensor(10.3304, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -7.292678356170654), (356351, 9.672621672507375e-05), (499999, 0.0)]
([5, 0, 8], [5, 0, 8])
tensor(8.8040, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -58.96889114379883), (356351, 0.0005141779547557235), (499999, 0.0)]
([0, 0, 8], [5, 0, 8])
tensor(5.2712, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -58.96839141845703), (356351, 0.0005087923491373658), (499999, 0.0)]
([0, 0, 8], [5, 0, 8])
tensor(5.2712, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -58.96763610839844), (356351, 0.0005090109771117568), (499999, 0.0)]
([0, 0, 8], [5, 0, 8])
tensor(5.2714, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -58.96052551269531), (356351, 0.0005087777972221375), (499999, 0.0)]
([0, 7, 8], [5, 0, 8])
tensor(6.1690, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -58.753936767578125), (356351, 29.172086715698242), (49

 82%|████████▏ | 593/725 [19:18:29<5:41:52, 155.40s/it]

test：0.5, test mean: 0.3
([1, 0, 5], [1, 0, 5])
tensor(10.6454, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.013334263116121292)]


 82%|████████▏ | 594/725 [19:18:55<4:14:10, 116.42s/it]

([7, 0, 7], [7, 0, 7])
tensor(16.4500, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.712027072906494), (499999, 6.656461160048366e-09), (380927, 2.9181621074676514)]
([7, 0, 7], [7, 0, 7])
tensor(9.2341, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 13.895052909851074), (499999, 1.3312922320096732e-08), (380927, 25.859825134277344)]
([7, 0, 7], [7, 0, 7])
tensor(5.6388, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 13.190286636352539), (499999, 1.9969384368323517e-08), (380927, 45.07191848754883)]
([0, 0, 0], [7, 0, 7])
tensor(0.0187, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 13.183858871459961), (499999, 2.6625844640193463e-08), (380927, 45.08619689941406)]
([0, 0, 0], [7, 0, 7])
tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 13.177431106567383), (499999, 3.328230846477709e-08), (380927, 45.087650299072266)]
([0, 0, 0], [7, 0, 7])
tensor(0.0003, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 13.170999526977539), 

 82%|████████▏ | 595/725 [19:23:46<6:06:12, 169.02s/it]

test：0.0, test mean: 0.29854368932038833
([2, 2, 7], [2, 2, 7])
tensor(15.4277, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 82%|████████▏ | 596/725 [19:24:15<4:32:45, 126.86s/it]

([1, 2, 2], [1, 2, 2])
tensor(25.2035, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 82%|████████▏ | 597/725 [19:24:39<3:25:07, 96.16s/it] 

([3, 4, 6], [3, 4, 6])
tensor(7.2507, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -17.19189453125), (499999, 0.0), (499999, 0.0)]
([3, 4, 6], [3, 4, 6])
tensor(6.8376, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -30.917537689208984), (499999, 0.0), (499999, 0.0)]
([3, 4, 6], [3, 4, 6])
tensor(4.6355, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -27.519906997680664), (499999, 0.0), (499999, 0.0)]
([3, 4, 6], [3, 4, 6])
tensor(3.2133, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -48.426063537597656), (499999, 0.0), (499999, 0.0)]
([3, 4, 6], [3, 4, 6])
tensor(3.2626, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -56.37257385253906), (499999, 0.0), (499999, 0.0)]
([0, 4, 6], [3, 4, 6])
tensor(2.7177, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -56.87102508544922), (499999, 0.0), (499999, 0.0)]
([0, 4, 6], [3, 4, 6])
tensor(2.7058, device='cuda:0', grad_fn=<NllLossBackward0>)
[(233471, -57.50735092163086), (499999, 0.0), (499999

 82%|████████▏ | 598/725 [19:29:05<5:11:12, 147.03s/it]

test：0.0, test mean: 0.2971014492753623
([1, 2, 2], [1, 2, 2])
tensor(20.4037, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 83%|████████▎ | 599/725 [19:29:27<3:50:01, 109.53s/it]

([2, 0, 3], [2, 0, 3])
tensor(14.3573, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -0.0001537875650683418), (499999, 0.0)]
([2, 0, 3], [2, 0, 3])
tensor(14.3573, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -0.0003075751010328531), (499999, 0.0)]
([2, 0, 3], [2, 0, 3])
tensor(14.3573, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -0.0004613626515492797), (499999, 0.0)]
([2, 0, 3], [2, 0, 3])
tensor(14.3573, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -0.0006151502020657063), (499999, 0.0)]
([2, 0, 3], [2, 0, 3])
tensor(14.3573, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -0.0007689377525821328), (499999, 0.0)]
([2, 0, 3], [2, 0, 3])
tensor(14.3573, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (430079, -0.0009227253030985594), (499999, 0.0)]
([2, 0, 3], [2, 0, 3])
tensor(14.3573, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (43007

 83%|████████▎ | 600/725 [19:34:04<5:32:37, 159.66s/it]

test：1.0, test mean: 0.3004807692307692
([5, 1, 1], [5, 1, 1])
tensor(20.2633, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -6.279983790591359e-05), (499999, 0.0), (499999, 0.0)]


 83%|████████▎ | 601/725 [19:34:27<4:05:27, 118.77s/it]

([0, 1, 5], [0, 1, 5])
tensor(8.4179, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 83%|████████▎ | 602/725 [19:34:53<3:06:32, 91.00s/it] 

([1, 0, 0], [1, 0, 0])
tensor(3.8436, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.0029562152922153473), (499999, 0.0)]
([1, 0, 0], [1, 0, 0])
tensor(3.8456, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -0.23350566625595093), (499999, 0.0)]
([1, 0, 0], [1, 0, 0])
tensor(3.9456, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -0.3964197635650635), (499999, 0.0)]
([1, 0, 0], [1, 0, 0])
tensor(3.8940, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 2.7696876525878906), (499999, 0.0)]
([1, 0, 0], [1, 0, 0])
tensor(3.8351, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 2.7577438354492188), (499999, 0.0)]
([1, 0, 0], [1, 0, 0])
tensor(3.8350, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 2.7470955848693848), (499999, 0.0)]
([1, 0, 0], [1, 0, 0])
tensor(3.8349, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 2.746910572052002), (4

 83%|████████▎ | 603/725 [19:39:23<4:53:52, 144.53s/it]

test：1.0, test mean: 0.3038277511961722
([7, 0, 7], [7, 0, 7])
tensor(9.7069, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (442367, 3.8714040329068666e-07), (499999, 0.0)]
([7, 0, 7], [7, 0, 7])
tensor(6.8497, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 8.223464965820312), (442367, 1.1643396646832116e-05), (499999, 0.0)]
([7, 0, 7], [7, 0, 7])
tensor(5.0865, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.909973621368408), (442367, 1.164314744528383e-05), (499999, 0.0)]
([0, 0, 7], [7, 0, 7])
tensor(1.1251, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.829916000366211), (442367, 1.1550175258889794e-05), (499999, 0.0)]
([0, 0, 7], [7, 0, 7])
tensor(1.1251, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.749860763549805), (442367, 1.15502298285719e-05), (499999, 0.0)]
([0, 0, 7], [7, 0, 7])
tensor(1.1246, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -6.689143657684326), (442367, 1.153508128481917e-05), (4

 83%|████████▎ | 604/725 [19:43:58<6:10:48, 183.87s/it]

test：0.5, test mean: 0.3047619047619048
([2, 2, 5], [2, 2, 3])
tensor(26.8016, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 83%|████████▎ | 605/725 [19:44:25<4:33:07, 136.56s/it]

([2, 8, 6], [2, 8, 6])
tensor(17.2680, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 84%|████████▎ | 606/725 [19:44:49<3:24:14, 102.98s/it]

([8, 1, 0], [8, 1, 0])
tensor(8.6009, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 84%|████████▎ | 607/725 [19:45:13<2:35:38, 79.14s/it] 

([2, 0, 5], [2, 0, 5])
tensor(15.4508, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.029038066044449806)]


 84%|████████▍ | 608/725 [19:45:36<2:01:26, 62.28s/it]

([2, 1, 2], [2, 1, 2])
tensor(35.0191, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 84%|████████▍ | 609/725 [19:46:00<1:38:31, 50.96s/it]

([2, 7, 5], [2, 7, 5])
tensor(12.7315, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 84%|████████▍ | 610/725 [19:46:25<1:22:51, 43.23s/it]

([2, 1, 8], [2, 1, 8])
tensor(22.4028, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 84%|████████▍ | 611/725 [19:46:51<1:12:03, 37.93s/it]

([2, 7, 1], [2, 7, 1])
tensor(17.4396, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.712027072906494), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(15.8225, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 32.15268325805664), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(10.6994, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 41.08137512207031), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(10.6527, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 41.08271789550781), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(10.6527, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 41.08406066894531), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(10.6527, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 41.08540344238281), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(10.6527, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 41.08675003051758), (499999

 84%|████████▍ | 612/725 [19:51:21<3:22:23, 107.47s/it]

test：0.0, test mean: 0.3033175355450237
([2, 7, 0], [2, 7, 0])
tensor(15.7488, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 2.918126344680786), (454655, -3.0888184454624934e-08)]
([2, 7, 0], [2, 7, 0])
tensor(12.1983, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 63.53908157348633), (454655, -6.177636890924987e-08)]
([2, 3, 0], [2, 7, 0])
tensor(10.7798, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 58.276390075683594), (454655, -9.266454981116112e-08)]
([2, 7, 0], [2, 7, 0])
tensor(10.9167, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 35.31264114379883), (454655, -1.2355273781849974e-07)]
([2, 0, 0], [2, 7, 0])
tensor(9.4794, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 35.3179931640625), (454655, -1.5444092582583835e-07)]
([2, 0, 0], [2, 7, 0])
tensor(9.4794, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 35.32124328613281), (454655, -1.853

 85%|████████▍ | 613/725 [19:56:01<4:57:34, 159.42s/it]

test：0.5, test mean: 0.30424528301886794
([1, 0, 2], [1, 0, 2])
tensor(14.0455, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 85%|████████▍ | 614/725 [19:56:25<3:39:31, 118.66s/it]

([0, 6, 8], [0, 6, 8])
tensor(4.8362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -6.306866453087423e-06), (499999, 0.0), (499999, 0.0)]
([0, 6, 8], [0, 6, 8])
tensor(4.8362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.00010161664977204055), (499999, 0.0), (499999, 0.0)]
([0, 6, 8], [0, 6, 8])
tensor(4.8362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.001382275833748281), (499999, 0.0), (499999, 0.0)]
([0, 6, 8], [0, 6, 8])
tensor(4.8362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.0015050144866108894), (499999, 0.0), (499999, 0.0)]
([0, 6, 8], [0, 6, 8])
tensor(4.8362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.0017666200874373317), (499999, 0.0), (499999, 0.0)]
([0, 6, 8], [0, 6, 8])
tensor(4.8362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.0008216567803174257), (499999, 0.0), (499999, 0.0)]
([0, 6, 8], [0, 6, 8])
tensor(4.8362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, -0.0011895413044840

 85%|████████▍ | 615/725 [20:00:53<4:59:33, 163.39s/it]

test：1.0, test mean: 0.3075117370892019
([1, 2, 2], [1, 2, 2])
tensor(27.1718, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 85%|████████▍ | 616/725 [20:01:17<3:41:11, 121.76s/it]

([7, 1, 7], [7, 1, 7])
tensor(21.0395, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.712027072906494), (499999, 0.0), (380927, 2.9181621074676514)]
([7, 1, 7], [7, 1, 7])
tensor(16.6203, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 18.180551528930664), (499999, 0.0), (380927, 20.06083869934082)]
([0, 1, 7], [7, 1, 7])
tensor(8.7137, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 27.44565200805664), (499999, 0.0), (380927, 13.937744140625)]
([0, 1, 0], [7, 1, 7])
tensor(6.3486, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 27.368694305419922), (499999, 0.0), (380927, 13.924232482910156)]
([0, 1, 0], [7, 1, 7])
tensor(6.3485, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 27.297428131103516), (499999, 0.0), (380927, 13.91071891784668)]
([0, 1, 0], [7, 1, 7])
tensor(6.3485, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 27.22616958618164), (499999, 0.0), (380927, 13.897205352783203)]
([0, 1, 0], [7, 1, 7])
tensor(6.3485, device='cuda:0

 85%|████████▌ | 617/725 [20:06:03<5:07:29, 170.83s/it]

test：0.0, test mean: 0.3060747663551402
([0, 0, 1], [0, 0, 1])
tensor(5.4118, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 5.748896683144267e-07), (499999, 0.0), (499999, 0.0)]


 85%|████████▌ | 618/725 [20:06:29<3:47:12, 127.41s/it]

([0, 5, 0], [0, 5, 0])
tensor(3.4728, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 9.568329915055074e-07)]
([0, 5, 0], [0, 5, 0])
tensor(3.4728, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 0.00015006349713075906)]
([0, 5, 0], [0, 5, 0])
tensor(3.4728, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 0.0007608133601024747)]
([0, 5, 0], [0, 5, 0])
tensor(3.4728, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 0.0007607918232679367)]
([0, 5, 0], [0, 5, 0])
tensor(3.4728, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 0.0007607704028487206)]
([0, 5, 0], [0, 5, 0])
tensor(3.4728, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 0.0007607489242218435)]
([0, 5, 0], [0, 5, 0])
tensor(3.4728, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (49

 85%|████████▌ | 619/725 [20:11:10<5:06:39, 173.58s/it]

test：1.0, test mean: 0.30930232558139537
([0, 5, 5], [0, 0, 5])
tensor(4.0067, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 86%|████████▌ | 620/725 [20:11:35<3:45:59, 129.14s/it]

([2, 5, 1], [2, 3, 1])
tensor(22.1946, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.012345748022198677), (499999, 0.0)]


 86%|████████▌ | 621/725 [20:11:58<2:48:20, 97.12s/it] 

([1, 1, 5], [1, 1, 5])
tensor(20.3030, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 15.92171859741211)]
([1, 1, 8], [1, 1, 5])
tensor(19.4883, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 50.72854995727539)]
([1, 1, 0], [1, 1, 5])
tensor(12.8320, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 49.3968505859375)]
([1, 1, 0], [1, 1, 5])
tensor(12.8147, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 49.396541595458984)]
([1, 1, 0], [1, 1, 5])
tensor(12.8147, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 49.39623260498047)]
([1, 1, 0], [1, 1, 5])
tensor(12.8168, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 49.82863998413086)]
([1, 1, 0], [1, 1, 5])
tensor(12.8162, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, 50.186538696

 86%|████████▌ | 622/725 [20:16:33<4:18:26, 150.55s/it]

test：0.0, test mean: 0.30787037037037035
([2, 2, 1], [2, 2, 1])
tensor(22.2129, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 86%|████████▌ | 623/725 [20:17:00<3:12:39, 113.33s/it]

([2, 8, 1], [2, 8, 1])
tensor(20.9362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 86%|████████▌ | 624/725 [20:17:24<2:25:41, 86.55s/it] 

([2, 2, 2], [2, 2, 2])
tensor(25.3987, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 86%|████████▌ | 625/725 [20:17:46<1:52:04, 67.25s/it]

([5, 0, 2], [5, 4, 2])
tensor(15.0682, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 86%|████████▋ | 626/725 [20:18:08<1:28:41, 53.75s/it]

([5, 2, 1], [5, 2, 1])
tensor(25.5805, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -2.8537793159484863), (499999, 0.0), (499999, 0.0)]
([8, 2, 1], [5, 2, 1])
tensor(23.8754, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 7.953456401824951), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(18.2335, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 16.362232208251953), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(18.1718, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 16.362213134765625), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(18.1718, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 16.36219596862793), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(18.1718, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 16.362173080444336), (499999, 0.0), (499999, 0.0)]
([0, 2, 1], [5, 2, 1])
tensor(18.1718, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, 16.36215591430664), (499999, 0.0), (4

 86%|████████▋ | 627/725 [20:22:41<3:15:01, 119.40s/it]

test：0.0, test mean: 0.3064516129032258
([1, 7, 0], [1, 7, 0])
tensor(11.4150, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 87%|████████▋ | 628/725 [20:23:04<2:26:35, 90.67s/it] 

([0, 0, 2], [0, 0, 2])
tensor(10.0900, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 1.6266548641397094e-07), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(10.1137, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 4.609086513519287), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(10.0900, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 4.609086990356445), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(10.0900, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 4.609112739562988), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(10.0900, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 4.609112739562988), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(10.0900, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 4.609127521514893), (499999, 0.0)]
([0, 0, 2], [0, 0, 2])
tensor(10.0900, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (491519, 4.609127521514893), (4

 87%|████████▋ | 629/725 [20:27:33<3:50:43, 144.20s/it]

test：1.0, test mean: 0.30963302752293576
([0, 1, 5], [0, 1, 5])
tensor(12.1504, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 87%|████████▋ | 630/725 [20:27:57<2:51:11, 108.12s/it]

([0, 0, 1], [6, 0, 1])
tensor(7.7001, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 1.274079204449663e-05), (499999, 0.0)]
([0, 0, 1], [6, 0, 1])
tensor(7.7001, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.00010473870497662574), (499999, 0.0)]
([0, 0, 1], [6, 0, 1])
tensor(7.7001, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.0001047489931806922), (499999, 0.0)]
([0, 0, 1], [6, 0, 1])
tensor(7.7001, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.00010475928866071627), (499999, 0.0)]
([0, 0, 1], [6, 0, 1])
tensor(7.7001, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.00010476959869265556), (499999, 0.0)]
([0, 0, 1], [6, 0, 1])
tensor(7.7001, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.00010477953037479892), (499999, 0.0)]
([0, 0, 1], [6, 0, 1])
tensor(7.7001, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, 0.0001

 87%|████████▋ | 631/725 [20:32:40<4:11:24, 160.47s/it]

test：1.0, test mean: 0.3127853881278539
([6, 5, 6], [6, 5, 6])
tensor(10.3153, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 87%|████████▋ | 632/725 [20:33:04<3:05:19, 119.56s/it]

([1, 0, 2], [1, 0, 2])
tensor(14.8521, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 87%|████████▋ | 633/725 [20:33:30<2:20:12, 91.44s/it] 

([7, 5, 0], [7, 5, 0])
tensor(10.6909, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (499999, 0.0)]
([7, 5, 0], [7, 5, 0])
tensor(7.1836, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 17.590667724609375), (499999, 0.0), (499999, 0.0)]
([7, 5, 0], [7, 5, 0])
tensor(6.7001, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 14.870208740234375), (499999, 0.0), (499999, 0.0)]
([7, 5, 0], [7, 5, 0])
tensor(3.0932, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.006977081298828), (499999, 0.0), (499999, 0.0)]
([0, 5, 0], [7, 5, 0])
tensor(1.8146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.0113372802734375), (499999, 0.0), (499999, 0.0)]
([0, 5, 0], [7, 5, 0])
tensor(1.8146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.0156965255737305), (499999, 0.0), (499999, 0.0)]
([0, 5, 0], [7, 5, 0])
tensor(1.8145, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, -4.018598556518555), (499999, 0.0), (499

 87%|████████▋ | 634/725 [20:38:00<3:39:56, 145.02s/it]

test：0.0, test mean: 0.31136363636363634
([2, 2, 1], [2, 2, 1])
tensor(26.7970, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 88%|████████▊ | 635/725 [20:38:25<2:43:31, 109.02s/it]

([0, 6, 2], [0, 6, 2])
tensor(9.6748, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 88%|████████▊ | 636/725 [20:38:49<2:04:00, 83.60s/it] 

([2, 6, 2], [2, 6, 2])
tensor(21.4162, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 88%|████████▊ | 637/725 [20:39:13<1:36:26, 65.76s/it]

([3, 2, 7], [7, 2, 7])
tensor(17.5889, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.9369421005249023)]
([3, 2, 7], [7, 2, 7])
tensor(14.3109, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 24.761234283447266)]
([3, 2, 7], [7, 2, 7])
tensor(11.5348, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 1.757192611694336)]
([3, 2, 0], [7, 2, 7])
tensor(11.1829, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 1.7571582794189453)]
([3, 2, 0], [7, 2, 7])
tensor(11.1829, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 1.7571277618408203)]
([3, 2, 0], [7, 2, 7])
tensor(11.1829, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 1.7570934295654297)]
([3, 2, 0], [7, 2, 7])
tensor(11.1829, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 1.75706

 88%|████████▊ | 638/725 [20:44:09<3:15:22, 134.74s/it]

test：0.0, test mean: 0.30995475113122173
([8, 2, 2], [8, 2, 2])
tensor(27.1440, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.011059565469622612), (499999, 0.0)]


 88%|████████▊ | 639/725 [20:44:33<2:25:41, 101.64s/it]

([1, 8, 2], [1, 8, 2])
tensor(18.7712, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 88%|████████▊ | 640/725 [20:44:58<1:51:25, 78.65s/it] 

([3, 2, 3], [3, 2, 5])
tensor(16.8389, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (159743, -11.525435447692871)]
([3, 2, 3], [3, 2, 5])
tensor(15.7539, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (159743, -10.267362594604492)]
([3, 2, 0], [3, 2, 5])
tensor(14.5443, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (159743, -10.365676879882812)]
([3, 2, 0], [3, 2, 5])
tensor(14.5427, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (159743, -10.364093780517578)]
([3, 2, 0], [3, 2, 5])
tensor(14.5427, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (159743, -10.364093780517578)]
([3, 2, 0], [3, 2, 5])
tensor(14.5427, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (159743, -10.364093780517578)]
([3, 2, 0], [3, 2, 5])
tensor(14.5427, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (159743, 

 88%|████████▊ | 641/725 [20:50:28<3:35:40, 154.05s/it]

test：0.0, test mean: 0.30855855855855857
([2, 2, 3], [2, 2, 3])
tensor(24.9783, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.010405882261693478), (454655, -17.191205978393555)]
([2, 2, 3], [2, 2, 3])
tensor(24.7338, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.020811764523386955), (454655, -28.5147705078125)]
([2, 2, 3], [2, 2, 3])
tensor(22.9589, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.03121764585375786), (454655, -23.600627899169922)]
([2, 2, 3], [2, 2, 3])
tensor(21.3672, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.04162352904677391), (454655, -22.810592651367188)]
([2, 2, 0], [2, 2, 3])
tensor(20.6606, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.052029408514499664), (454655, -15.473100662231445)]
([2, 2, 0], [2, 2, 3])
tensor(20.5977, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.06243529170751572), (454655, -1

 89%|████████▊ | 642/725 [20:55:06<4:24:27, 191.17s/it]

test：0.0, test mean: 0.3071748878923767
([2, 7, 2], [2, 7, 2])
tensor(23.2496, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9369421005249023), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(21.0652, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 22.152923583984375), (499999, 0.0)]
([2, 7, 2], [2, 7, 2])
tensor(17.9657, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -6.287190914154053), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(16.4762, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -6.239933013916016), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(16.4761, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -6.216575622558594), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(16.4761, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, -6.19321870803833), (499999, 0.0)]
([2, 0, 2], [2, 7, 2])
tensor(16.4761, device='cuda:0', grad_fn=<NllLossBackward0>)
[(49999

 89%|████████▊ | 643/725 [20:59:49<4:58:50, 218.67s/it]

test：0.0, test mean: 0.30580357142857145
([2, 8, 3], [2, 8, 3])
tensor(21.6676, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 89%|████████▉ | 644/725 [21:00:15<3:37:08, 160.84s/it]

([6, 1, 1], [6, 1, 1])
tensor(18.8733, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 89%|████████▉ | 645/725 [21:00:41<2:40:26, 120.33s/it]

([1, 2, 2], [1, 2, 2])
tensor(31.3398, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 89%|████████▉ | 646/725 [21:01:06<2:00:51, 91.79s/it] 

([2, 8, 0], [2, 8, 0])
tensor(15.7320, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 89%|████████▉ | 647/725 [21:01:31<1:33:26, 71.88s/it]

([0, 2, 1], [0, 2, 1])
tensor(11.1023, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.04147661477327347)]


 89%|████████▉ | 648/725 [21:01:58<1:14:43, 58.22s/it]

([0, 7, 0], [0, 7, 0])
tensor(6.7462, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([0, 7, 0], [0, 7, 0])
tensor(4.6830, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 21.84639549255371), (499999, 0.0)]
([0, 7, 0], [0, 7, 0])
tensor(3.8580, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 20.22881317138672), (499999, 0.0)]
([0, 0, 0], [0, 7, 0])
tensor(0.1173, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 17.570005416870117), (499999, 0.0)]
([0, 0, 0], [0, 7, 0])
tensor(0.0917, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 17.948219299316406), (499999, 0.0)]
([0, 0, 0], [0, 7, 0])
tensor(0.0906, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 18.106000900268555), (499999, 0.0)]
([0, 0, 0], [0, 7, 0])
tensor(0.0903, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 18.198509216308594), (499999, 

 90%|████████▉ | 649/725 [21:06:37<2:37:50, 124.61s/it]

test：0.0, test mean: 0.30444444444444446
([2, 2, 1], [2, 2, 1])
tensor(25.2809, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 90%|████████▉ | 650/725 [21:07:02<1:58:21, 94.69s/it] 

([5, 7, 2], [5, 7, 2])
tensor(21.2289, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -2.662940502166748), (380927, 2.9181621074676514), (499999, 0.0)]
([5, 7, 2], [5, 7, 2])
tensor(17.5914, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -21.766372680664062), (380927, 19.355602264404297), (499999, 0.0)]
([0, 7, 2], [5, 7, 2])
tensor(16.6439, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -21.762645721435547), (380927, 15.591690063476562), (499999, 0.0)]
([0, 7, 2], [5, 7, 2])
tensor(12.9263, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -21.75254249572754), (380927, 20.236967086791992), (499999, 0.0)]
([0, 0, 2], [5, 7, 2])
tensor(11.8783, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -21.739192962646484), (380927, 20.226377487182617), (499999, 0.0)]
([0, 0, 2], [5, 7, 2])
tensor(11.8783, device='cuda:0', grad_fn=<NllLossBackward0>)
[(491519, -21.725841522216797), (380927, 20.215791702270508), (499999, 0.0)]
([0, 0, 2], [5, 7, 2])
tensor(11.878

 90%|████████▉ | 651/725 [21:11:38<3:03:39, 148.92s/it]

test：0.0, test mean: 0.3030973451327434
([1, 8, 2], [1, 8, 2])
tensor(20.3893, device='cuda:0', grad_fn=<NllLossBackward0>)


 90%|████████▉ | 652/725 [21:12:04<2:16:23, 112.11s/it]

[(499999, 0.0), (499999, 0.0), (499999, 0.0)]
([0, 7, 5], [0, 7, 5])
tensor(10.0789, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 2.376247110191798e-08), (380927, 3.1132798194885254), (499999, 0.0)]
([0, 7, 5], [0, 7, 5])
tensor(8.3898, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -4.4228560369674597e-08), (380927, 5.522191047668457), (499999, 0.0)]
([0, 7, 5], [0, 7, 5])
tensor(6.5525, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.12390042841434479), (380927, -5.562060356140137), (499999, 0.0)]
([0, 0, 5], [0, 7, 5])
tensor(4.0452, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.123900406062603), (380927, -5.629251480102539), (499999, 0.0)]
([0, 0, 5], [0, 7, 5])
tensor(4.0452, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -0.12390047311782837), (380927, -5.696442604064941), (499999, 0.0)]
([0, 0, 5], [0, 7, 5])
tensor(4.0939, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -2.704643726348877), (380927, -5.763635635375977), (49

 90%|█████████ | 653/725 [21:16:33<3:11:14, 159.37s/it]

test：0.5, test mean: 0.3039647577092511
([7, 1, 1], [7, 1, 1])
tensor(12.2684, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.9181251525878906), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [7, 1, 1])
tensor(9.1903, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 58.43293762207031), (499999, 0.0), (499999, 0.0)]
([7, 1, 1], [7, 1, 1])
tensor(8.3042, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 46.767818450927734), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(7.6506, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 46.762489318847656), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(7.6506, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 46.75817108154297), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(7.6506, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 46.75386047363281), (499999, 0.0), (499999, 0.0)]
([0, 1, 1], [7, 1, 1])
tensor(7.6506, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 46.74

 90%|█████████ | 654/725 [21:20:58<3:45:56, 190.94s/it]

test：0.0, test mean: 0.3026315789473684
([8, 6, 1], [8, 6, 1])
tensor(18.4152, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 90%|█████████ | 655/725 [21:21:23<2:44:46, 141.24s/it]

([2, 7, 1], [2, 7, 1])
tensor(19.3211, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.7659730911254883), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(17.5251, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 9.73034954071045), (499999, 0.0)]
([2, 7, 1], [2, 7, 1])
tensor(13.4708, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 47.41679000854492), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(11.3781, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 47.19541931152344), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(11.3771, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 47.0955810546875), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(11.3769, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 47.03074264526367), (499999, 0.0)]
([2, 0, 1], [2, 7, 1])
tensor(11.3768, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 46.984657287597656), (499999

 90%|█████████ | 656/725 [21:26:03<3:30:11, 182.78s/it]

test：0.0, test mean: 0.30131004366812225
([6, 0, 1], [6, 0, 1])
tensor(7.9463, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -1.231741180163226e-07), (499999, 0.0)]
([6, 0, 1], [6, 0, 1])
tensor(7.9463, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, 6.708463047289115e-07), (499999, 0.0)]
([6, 0, 1], [6, 0, 1])
tensor(7.9463, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -1.7991475260714651e-06), (499999, 0.0)]
([6, 0, 1], [6, 0, 1])
tensor(7.9463, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -1.7990579408433405e-06), (499999, 0.0)]
([6, 0, 1], [6, 0, 1])
tensor(7.9463, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -1.7249617485504132e-06), (499999, 0.0)]
([6, 0, 1], [6, 0, 1])
tensor(7.9463, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (466943, -1.7247550658794353e-06), (499999, 0.0)]
([6, 0, 1], [6, 0, 1])
tensor(7.9463, device='cuda:0', grad_fn=<NllL

 91%|█████████ | 657/725 [21:30:36<3:57:55, 209.94s/it]

test：1.0, test mean: 0.30434782608695654
([0, 6, 2], [0, 6, 2])
tensor(15.7685, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 91%|█████████ | 658/725 [21:31:03<2:53:02, 154.96s/it]

([1, 5, 1], [1, 5, 1])
tensor(20.5615, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -2.943026065826416), (499999, 0.0)]
([1, 5, 1], [1, 5, 1])
tensor(20.3956, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -20.394071578979492), (499999, 0.0)]
([1, 5, 1], [1, 5, 1])
tensor(18.1790, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -46.87676239013672), (499999, 0.0)]
([1, 3, 1], [1, 5, 1])
tensor(17.3651, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -66.77754974365234), (499999, 0.0)]
([1, 0, 1], [1, 5, 1])
tensor(16.4089, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -66.7796859741211), (499999, 0.0)]
([1, 0, 1], [1, 5, 1])
tensor(16.4089, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -66.78292846679688), (499999, 0.0)]
([1, 0, 1], [1, 5, 1])
tensor(16.4089, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (479231, -66.78553771972656), 

 91%|█████████ | 659/725 [21:35:48<3:33:17, 193.90s/it]

test：0.0, test mean: 0.30303030303030304
([0, 1, 5], [1, 1, 5])
tensor(13.9787, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 91%|█████████ | 660/725 [21:36:14<2:35:39, 143.68s/it]

([3, 3, 1], [5, 3, 1])
tensor(11.8643, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 91%|█████████ | 661/725 [21:36:39<1:55:09, 107.96s/it]

([2, 2, 7], [2, 2, 7])
tensor(23.6659, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.9181621074676514)]
([2, 2, 7], [2, 2, 7])
tensor(21.3193, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 32.65496063232422)]
([2, 2, 7], [2, 2, 7])
tensor(20.2694, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 32.30670928955078)]
([2, 2, 0], [2, 2, 7])
tensor(17.0096, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 56.86933898925781)]
([2, 2, 0], [2, 2, 7])
tensor(16.9500, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 56.86170196533203)]
([2, 2, 0], [2, 2, 7])
tensor(16.9500, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 56.85378646850586)]
([2, 2, 0], [2, 2, 7])
tensor(16.9499, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 56.85256958

 91%|█████████▏| 662/725 [21:41:25<2:49:35, 161.52s/it]

test：0.0, test mean: 0.3017241379310345
([2, 2, 5], [2, 2, 5])
tensor(18.1618, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 91%|█████████▏| 663/725 [21:41:53<2:05:20, 121.30s/it]

([2, 2, 3], [2, 2, 5])
tensor(26.2042, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 92%|█████████▏| 664/725 [21:42:18<1:33:57, 92.42s/it] 

([1, 3, 7], [1, 1, 7])
tensor(10.4137, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 92%|█████████▏| 665/725 [21:42:43<1:12:08, 72.14s/it]

([8, 2, 0], [8, 2, 0])
tensor(19.4485, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 92%|█████████▏| 666/725 [21:43:09<57:19, 58.30s/it]  

([8, 1, 2], [8, 1, 2])
tensor(24.3304, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 92%|█████████▏| 667/725 [21:43:36<47:19, 48.96s/it]

([7, 0, 1], [7, 0, 1])
tensor(15.0409, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (499999, 0.0)]
([7, 0, 1], [7, 0, 1])
tensor(13.1102, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 21.427509307861328), (499999, 0.0), (499999, 0.0)]
([7, 0, 1], [7, 0, 1])
tensor(11.3849, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 20.79576873779297), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(9.0464, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 20.897829055786133), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(9.0462, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 20.973791122436523), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(9.0462, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 21.04974937438965), (499999, 0.0), (499999, 0.0)]
([0, 0, 1], [7, 0, 1])
tensor(9.0461, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 21.107742309570312), (499999, 0.0), (49999

 92%|█████████▏| 668/725 [21:48:06<1:49:35, 115.36s/it]

test：0.0, test mean: 0.30042918454935624
([8, 8, 7], [8, 8, 7])
tensor(16.6553, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 2.918144941329956)]
([8, 8, 7], [8, 8, 7])
tensor(13.0100, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 54.3059196472168)]
([8, 8, 3], [8, 8, 7])
tensor(10.8903, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 49.114158630371094)]
([8, 8, 0], [8, 8, 7])
tensor(9.9895, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 48.74325942993164)]
([8, 8, 0], [8, 8, 7])
tensor(9.9892, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 48.46113586425781)]
([8, 8, 0], [8, 8, 7])
tensor(9.9890, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (368639, 48.20974349975586)]
([8, 8, 0], [8, 8, 7])
tensor(9.9888, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0),

 92%|█████████▏| 669/725 [21:53:09<2:40:10, 171.62s/it]

test：0.0, test mean: 0.29914529914529914
([1, 1, 7], [1, 1, 7])
tensor(16.1300, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 92%|█████████▏| 670/725 [21:53:36<1:57:39, 128.36s/it]

([7, 5, 2], [7, 5, 2])
tensor(14.8083, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 93%|█████████▎| 671/725 [21:54:01<1:27:31, 97.25s/it] 

([0, 1, 1], [0, 1, 1])
tensor(10.2941, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -7.435500144958496)]
([0, 1, 1], [0, 1, 1])
tensor(10.2959, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -13.966899871826172)]
([0, 1, 1], [0, 1, 1])
tensor(10.2959, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -20.49829864501953)]
([0, 1, 1], [0, 1, 1])
tensor(10.2959, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -27.029701232910156)]
([0, 1, 1], [0, 1, 1])
tensor(10.2959, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -33.561100006103516)]
([0, 1, 1], [0, 1, 1])
tensor(10.2959, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -40.092498779296875)]
([0, 1, 1], [0, 1, 1])
tensor(10.2959, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -4

 93%|█████████▎| 672/725 [21:58:50<2:16:41, 154.74s/it]

test：1.0, test mean: 0.3021276595744681
([2, 0, 8], [2, 0, 8])
tensor(15.7471, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 93%|█████████▎| 673/725 [21:59:15<1:40:26, 115.90s/it]

([2, 2, 2], [2, 2, 2])
tensor(26.4796, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 93%|█████████▎| 674/725 [21:59:40<1:15:15, 88.55s/it] 

([2, 2, 2], [2, 2, 2])
tensor(27.7455, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 93%|█████████▎| 675/725 [22:00:08<58:41, 70.43s/it]  

([0, 1, 1], [0, 1, 1])
tensor(16.9749, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.004396188538521528)]


 93%|█████████▎| 676/725 [22:00:34<46:30, 56.95s/it]

([8, 7, 2], [8, 1, 2])
tensor(18.2174, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 93%|█████████▎| 677/725 [22:00:58<37:49, 47.27s/it]

([0, 1, 2], [0, 1, 2])
tensor(14.6929, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.00047449953854084015), (499999, 0.0), (499999, 0.0)]


 94%|█████████▎| 678/725 [22:01:23<31:40, 40.43s/it]

([2, 2, 7], [2, 2, 7])
tensor(23.2627, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.00956355594098568), (499999, 0.0), (380927, 2.9423112869262695)]
([2, 2, 7], [2, 2, 7])
tensor(21.7544, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.01912711188197136), (499999, 0.0), (380927, 14.443353652954102)]
([2, 2, 7], [2, 2, 7])
tensor(19.7199, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.02869066223502159), (499999, 0.0), (380927, -0.5130929946899414)]
([2, 2, 0], [2, 2, 7])
tensor(15.8657, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.03825422376394272), (499999, 0.0), (380927, -18.798843383789062)]
([2, 2, 0], [2, 2, 7])
tensor(15.7160, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.04781778156757355), (499999, 0.0), (380927, -19.822397232055664)]
([2, 2, 0], [2, 2, 7])
tensor(15.7146, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -0.05738133564591408), (499999, 0.0), (380927, -20.601884841918945)]
([2, 2, 0], [2, 2, 7])
t

 94%|█████████▎| 679/725 [22:06:05<1:26:32, 112.88s/it]

test：0.0, test mean: 0.3008474576271186
([0, 7, 1], [0, 7, 1])
tensor(13.0668, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([0, 7, 1], [0, 7, 1])
tensor(9.8783, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 18.719202041625977), (499999, 0.0)]
([0, 7, 1], [0, 7, 1])
tensor(9.2588, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 16.404529571533203), (499999, 0.0)]
([0, 7, 1], [0, 7, 1])
tensor(5.6106, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.4399595260620117), (499999, 0.0)]
([0, 0, 1], [0, 7, 1])
tensor(4.5524, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.434657096862793), (499999, 0.0)]
([0, 0, 1], [0, 7, 1])
tensor(4.5524, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.4293527603149414), (499999, 0.0)]
([0, 0, 1], [0, 7, 1])
tensor(4.5524, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0

 94%|█████████▍| 680/725 [22:10:34<1:59:53, 159.85s/it]

test：0.0, test mean: 0.29957805907172996
([1, 1, 0], [1, 1, 0])
tensor(8.5887, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 94%|█████████▍| 681/725 [22:11:00<1:27:37, 119.49s/it]

([6, 2, 2], [6, 2, 2])
tensor(18.1530, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 94%|█████████▍| 682/725 [22:11:26<1:05:38, 91.59s/it] 

([5, 7, 2], [5, 7, 2])
tensor(18.0090, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 2.9423105716705322), (499999, -0.009563576430082321)]
([5, 7, 2], [5, 7, 2])
tensor(15.6010, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 10.473665237426758), (499999, -0.019127152860164642)]
([5, 7, 2], [5, 7, 2])
tensor(12.7417, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -4.108218669891357), (499999, -0.028690727427601814)]
([5, 0, 2], [5, 7, 2])
tensor(10.8182, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -4.107034206390381), (499999, -0.038254305720329285)]
([5, 0, 2], [5, 7, 2])
tensor(10.8182, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -4.105849266052246), (499999, -0.04781787097454071)]
([5, 0, 2], [5, 7, 2])
tensor(10.8182, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, -4.104665279388428), (499999, -0.05738145858049393)]
([5, 0, 2], [5, 7, 2])
t

 94%|█████████▍| 683/725 [22:16:00<1:42:28, 146.38s/it]

test：0.0, test mean: 0.29831932773109243
([1, 8, 5], [1, 5, 5])
tensor(13.9865, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 94%|█████████▍| 684/725 [22:16:25<1:15:03, 109.84s/it]

([6, 0, 7], [6, 0, 7])
tensor(11.9106, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 2.712027072906494)]
([6, 0, 7], [6, 0, 7])
tensor(10.1627, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 15.391658782958984)]
([6, 0, 0], [6, 0, 7])
tensor(3.9861, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 15.733098030090332)]
([6, 0, 0], [6, 0, 7])
tensor(3.9843, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 15.805066108703613)]
([6, 0, 0], [6, 0, 7])
tensor(3.9837, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 15.829710006713867)]
([6, 0, 0], [6, 0, 7])
tensor(3.9837, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 15.853694915771484)]
([6, 0, 0], [6, 0, 7])
tensor(3.9835, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (380927, 15.862866401

 94%|█████████▍| 685/725 [22:21:21<1:50:28, 165.71s/it]

test：0.0, test mean: 0.29707112970711297
([8, 3, 0], [8, 3, 0])
tensor(12.0560, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 95%|█████████▍| 686/725 [22:21:46<1:20:15, 123.47s/it]

([2, 5, 1], [2, 5, 1])
tensor(23.1946, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.016278903931379318), (499999, 0.0)]


 95%|█████████▍| 687/725 [22:22:11<59:35, 94.08s/it]   

([0, 7, 3], [0, 7, 3])
tensor(10.1101, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9423112869262695), (499999, 0.0)]
([0, 7, 3], [0, 7, 3])
tensor(9.0580, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 21.058330535888672), (499999, 0.0)]
([0, 7, 3], [0, 7, 3])
tensor(7.2240, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 11.503479957580566), (499999, 0.0)]
([0, 0, 3], [0, 7, 3])
tensor(4.1591, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 9.598215103149414), (499999, 0.0)]
([0, 0, 3], [0, 7, 3])
tensor(4.1511, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 8.322198867797852), (499999, 0.0)]
([0, 0, 3], [0, 7, 3])
tensor(4.1462, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 7.505561828613281), (499999, 0.0)]
([0, 0, 3], [0, 7, 3])
tensor(4.1454, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 6.762848854064941), (499999, 0

 95%|█████████▍| 688/725 [22:26:40<1:30:20, 146.49s/it]

test：0.0, test mean: 0.29583333333333334
([1, 2, 2], [1, 2, 2])
tensor(25.5993, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -7.439916133880615), (499999, 0.0), (499999, 0.0)]
([1, 2, 2], [1, 2, 2])
tensor(25.7022, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -13.658427238464355), (499999, 0.0), (499999, 0.0)]
([1, 2, 2], [1, 2, 2])
tensor(25.7022, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -19.876937866210938), (499999, 0.0), (499999, 0.0)]
([1, 2, 2], [1, 2, 2])
tensor(25.7022, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -26.09545135498047), (499999, 0.0), (499999, 0.0)]
([1, 2, 2], [1, 2, 2])
tensor(25.7022, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -32.31396484375), (499999, 0.0), (499999, 0.0)]
([1, 2, 2], [1, 2, 2])
tensor(25.7022, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454655, -38.532474517822266), (499999, 0.0), (499999, 0.0)]
([1, 2, 2], [1, 2, 2])
tensor(25.7022, device='cuda:0', grad_fn=<NllLossBackward0>)
[(454

 95%|█████████▌| 689/725 [22:31:21<1:52:10, 186.95s/it]

test：1.0, test mean: 0.2987551867219917
([2, 2, 2], [2, 2, 2])
tensor(25.8609, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 95%|█████████▌| 690/725 [22:31:48<1:20:55, 138.72s/it]

([2, 3, 5], [2, 3, 5])
tensor(19.3429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.004345881287008524)]


 95%|█████████▌| 691/725 [22:32:13<59:24, 104.85s/it]  

([6, 5, 0], [6, 5, 0])
tensor(7.0996, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 95%|█████████▌| 692/725 [22:32:39<44:30, 80.93s/it] 

([8, 8, 5], [8, 8, 5])
tensor(9.7608, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -3.180683135986328)]
([8, 8, 5], [8, 8, 5])
tensor(9.4480, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -23.671817779541016)]
([8, 8, 3], [8, 8, 5])
tensor(8.2173, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -14.906909942626953)]
([8, 8, 0], [8, 8, 5])
tensor(7.5132, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -14.906869888305664)]
([8, 8, 0], [8, 8, 5])
tensor(7.5132, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -14.906824111938477)]
([8, 8, 0], [8, 8, 5])
tensor(7.5132, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -14.906822204589844)]
([8, 8, 0], [8, 8, 5])
tensor(7.5132, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (491519, -14.9068

 96%|█████████▌| 693/725 [22:37:29<1:16:39, 143.74s/it]

test：0.0, test mean: 0.2975206611570248
([1, 7, 2], [1, 7, 2])
tensor(26.6729, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([1, 7, 2], [1, 7, 2])
tensor(23.3526, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 20.855751037597656), (499999, 0.0)]
([1, 7, 2], [1, 7, 2])
tensor(21.6540, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 5.088642120361328), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(20.1760, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 5.087458610534668), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(20.1760, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 5.086276054382324), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(20.1760, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 5.085094451904297), (499999, 0.0)]
([1, 0, 2], [1, 7, 2])
tensor(20.1760, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 

 96%|█████████▌| 694/725 [22:42:21<1:37:13, 188.19s/it]

test：0.0, test mean: 0.2962962962962963
([7, 7, 5], [7, 7, 5])
tensor(19.1129, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (380927, 2.7659730911254883), (499999, 0.0)]
([7, 7, 5], [7, 7, 5])
tensor(14.1157, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 15.844284057617188), (380927, 19.531150817871094), (499999, 0.0)]
([7, 7, 5], [7, 7, 5])
tensor(9.8303, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.736513614654541), (380927, 82.45433044433594), (499999, 0.0)]
([0, 0, 5], [7, 7, 5])
tensor(4.7358, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.7020869255065918), (380927, 82.4248275756836), (499999, 0.0)]
([0, 0, 5], [7, 7, 5])
tensor(4.7358, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.667661190032959), (380927, 82.39714050292969), (499999, 0.0)]
([0, 0, 5], [7, 7, 5])
tensor(4.7358, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 0.6332359313964844), (380927, 82.36946868896484), (499999, 0.0)]
([0, 0, 5],

 96%|█████████▌| 695/725 [22:46:50<1:46:15, 212.53s/it]

test：0.0, test mean: 0.29508196721311475
([1, 2, 2], [1, 2, 2])
tensor(28.4454, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 96%|█████████▌| 696/725 [22:47:15<1:15:27, 156.12s/it]

([2, 1, 3], [2, 1, 3])
tensor(17.2873, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.005451403558254242), (499999, -0.002913515083491802)]


 96%|█████████▌| 697/725 [22:47:41<54:40, 117.15s/it]  

([2, 0, 3], [2, 0, 4])
tensor(11.0832, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 96%|█████████▋| 698/725 [22:48:07<40:27, 89.89s/it] 

([5, 2, 2], [5, 2, 2])
tensor(15.4788, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 96%|█████████▋| 699/725 [22:48:35<30:52, 71.26s/it]

([7, 2, 2], [7, 2, 2])
tensor(25.3976, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9423112869262695), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(23.7838, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 8.296073913574219), (499999, 0.0), (499999, 0.0)]
([7, 2, 2], [7, 2, 2])
tensor(21.5093, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.135575771331787), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(17.5092, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.451663970947266), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(17.4642, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.2932868003845215), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(17.4639, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.3161821365356445), (499999, 0.0), (499999, 0.0)]
([0, 2, 2], [7, 2, 2])
tensor(17.4637, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 5.341409683227539), (499999, 0.0), (499

 97%|█████████▋| 700/725 [22:53:12<55:28, 133.15s/it]

test：0.0, test mean: 0.2938775510204082
([1, 0, 0], [1, 0, 0])
tensor(5.7021, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 97%|█████████▋| 701/725 [22:53:38<40:20, 100.87s/it]

([1, 1, 0], [1, 1, 0])
tensor(9.0984, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -0.000386392988730222)]
([1, 1, 0], [1, 1, 0])
tensor(9.0984, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -0.0003124058712273836)]
([1, 1, 0], [1, 1, 0])
tensor(9.0984, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -0.0002944853622466326)]
([1, 1, 0], [1, 1, 0])
tensor(9.0984, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -0.00029448792338371277)]
([1, 1, 0], [1, 1, 0])
tensor(9.0984, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -0.0002944905136246234)]
([1, 1, 0], [1, 1, 0])
tensor(9.0984, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (479231, -0.0002944929292425513)]
([1, 1, 0], [1, 1, 0])
tensor(9.0984, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0)

 97%|█████████▋| 702/725 [22:58:36<1:01:22, 160.10s/it]

test：1.0, test mean: 0.2967479674796748
([8, 3, 6], [8, 3, 6])
tensor(13.9429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 97%|█████████▋| 703/725 [22:59:05<44:12, 120.58s/it]  

([8, 1, 3], [7, 1, 3])
tensor(14.0679, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, -0.0009988413657993078)]


 97%|█████████▋| 704/725 [22:59:30<32:15, 92.14s/it] 

([7, 2, 3], [7, 2, 3])
tensor(13.9338, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.69321870803833), (499999, 0.0), (479231, -16.20624351501465)]
([7, 2, 3], [7, 2, 3])
tensor(11.1714, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 46.6148681640625), (499999, 0.0), (479231, -26.75152015686035)]
([0, 2, 0], [7, 2, 3])
tensor(8.8147, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 72.22415924072266), (499999, 0.0), (479231, -27.148468017578125)]
([0, 2, 0], [7, 2, 3])
tensor(8.5781, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 72.22413635253906), (499999, 0.0), (479231, -27.038185119628906)]
([0, 2, 0], [7, 2, 3])
tensor(8.5666, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 72.22413635253906), (499999, 0.0), (479231, -27.013118743896484)]
([0, 2, 0], [7, 2, 3])
tensor(8.5611, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 72.22393035888672), (499999, 0.0), (479231, -26.985450744628906)]
([0, 2, 0], [7, 2, 3])
tensor(8.5580, device='cu

 97%|█████████▋| 705/725 [23:04:12<49:39, 149.00s/it]

test：0.0, test mean: 0.29554655870445345
([2, 2, 7], [2, 2, 7])
tensor(26.8148, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0312858447432518), (380927, 2.7659730911254883)]
([2, 2, 7], [2, 2, 7])
tensor(24.4563, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0625716894865036), (380927, 7.946239471435547)]
([2, 2, 7], [2, 2, 7])
tensor(19.7511, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0938575342297554), (380927, 61.371559143066406)]
([2, 2, 0], [2, 2, 7])
tensor(18.3225, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.1251433789730072), (380927, 61.111244201660156)]
([2, 2, 0], [2, 2, 7])
tensor(18.3219, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.1564292311668396), (380927, 60.86906433105469)]
([2, 2, 0], [2, 2, 7])
tensor(18.3203, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.1877150535583496), (380927, 60.816184997558594)]
(

 97%|█████████▋| 706/725 [23:09:22<1:02:27, 197.22s/it]

test：0.0, test mean: 0.29435483870967744
([2, 2, 0], [2, 2, 0])
tensor(16.3135, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 98%|█████████▊| 707/725 [23:09:48<43:45, 145.83s/it]  

([2, 2, 3], [2, 2, 3])
tensor(23.7071, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 98%|█████████▊| 708/725 [23:10:16<31:19, 110.55s/it]

([2, 2, 5], [2, 2, 5])
tensor(22.0702, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, -0.008453618735074997), (499999, 0.0)]


 98%|█████████▊| 709/725 [23:10:42<22:43, 85.19s/it] 

([8, 7, 5], [8, 7, 2])
tensor(11.5468, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 2.9181621074676514), (499999, 0.0)]
([8, 7, 5], [8, 7, 2])
tensor(9.5886, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 16.938064575195312), (499999, 0.0)]
([8, 7, 5], [8, 7, 2])
tensor(7.8107, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 11.283222198486328), (499999, 0.0)]
([8, 0, 5], [8, 7, 2])
tensor(5.4732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 11.285614013671875), (499999, 0.0)]
([8, 0, 5], [8, 7, 2])
tensor(5.4732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 11.287803649902344), (499999, 0.0)]
([8, 0, 5], [8, 7, 2])
tensor(5.4732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 11.289993286132812), (499999, 0.0)]
([8, 0, 5], [8, 7, 2])
tensor(5.4732, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (380927, 11.292181015014648), (49999

 98%|█████████▊| 710/725 [23:15:25<36:06, 144.46s/it]

test：0.0, test mean: 0.2931726907630522
([1, 7, 2], [1, 7, 2])
tensor(14.9594, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 98%|█████████▊| 711/725 [23:15:52<25:29, 109.22s/it]

([1, 2, 1], [1, 2, 1])
tensor(24.9081, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0009793126955628395)]


 98%|█████████▊| 712/725 [23:16:18<18:15, 84.30s/it] 

([8, 3, 2], [8, 3, 2])
tensor(25.3103, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 98%|█████████▊| 713/725 [23:16:44<13:20, 66.72s/it]

([7, 7, 1], [7, 7, 1])
tensor(13.9404, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 2.6932425498962402), (499999, 0.0)]
([7, 7, 1], [7, 7, 1])
tensor(10.8149, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 27.69367790222168), (499999, 0.0)]
([7, 3, 1], [7, 7, 1])
tensor(8.0393, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 5.7262725830078125), (499999, 0.0)]
([7, 0, 1], [7, 7, 1])
tensor(7.4363, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 5.73738956451416), (499999, 0.0)]
([7, 0, 1], [7, 7, 1])
tensor(7.4362, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 5.749198913574219), (499999, 0.0)]
([7, 0, 1], [7, 7, 1])
tensor(7.4359, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 5.749205589294434), (499999, 0.0)]
([7, 0, 1], [7, 7, 1])
tensor(7.4359, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (368639, 5.749207496643066), (499999, 0.

 98%|█████████▊| 714/725 [23:21:21<23:48, 129.88s/it]

test：0.0, test mean: 0.292
([2, 1, 2], [2, 1, 2])
tensor(27.5609, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 99%|█████████▊| 715/725 [23:21:47<16:26, 98.60s/it] 

([1, 1, 1], [1, 1, 1])
tensor(19.9907, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 99%|█████████▉| 716/725 [23:22:11<11:28, 76.47s/it]

([3, 1, 5], [3, 1, 5])
tensor(16.3413, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, -16.574087142944336), (499999, 0.0), (499999, 0.01073096040636301)]
([3, 1, 5], [3, 1, 5])
tensor(15.8667, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, -24.513708114624023), (499999, 0.0), (499999, 0.02146192081272602)]
([3, 1, 5], [3, 1, 5])
tensor(14.1798, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 67.62936401367188), (499999, 0.0), (499999, 0.03219287842512131)]
([0, 1, 5], [3, 1, 5])
tensor(10.5369, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 68.12836456298828), (499999, 0.0), (499999, 0.04292384162545204)]
([0, 1, 5], [3, 1, 5])
tensor(10.5342, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 68.37242126464844), (499999, 0.0), (499999, 0.05365479737520218)]
([0, 1, 5], [3, 1, 5])
tensor(10.5301, device='cuda:0', grad_fn=<NllLossBackward0>)
[(405503, 68.37874603271484), (499999, 0.0), (499999, 0.06438575685024261)]
([0, 1, 5], [3, 1, 5])
tensor(10.530

 99%|█████████▉| 717/725 [23:26:52<18:21, 137.66s/it]

test：0.0, test mean: 0.2908366533864542
([1, 2, 0], [1, 2, 0])
tensor(16.6488, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 99%|█████████▉| 718/725 [23:27:17<12:08, 104.03s/it]

([2, 6, 0], [2, 6, 0])
tensor(11.6731, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


 99%|█████████▉| 719/725 [23:27:41<08:00, 80.03s/it] 

([0, 0, 2], [0, 1, 2])
tensor(7.4429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, -2.8150127036496997e-05), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [0, 1, 2])
tensor(7.4429, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.00011321192141622305), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [0, 1, 2])
tensor(7.4428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.00013782150926999748), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [0, 1, 2])
tensor(7.4428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.00013735699758399278), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [0, 1, 2])
tensor(7.4428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.00013712386135011911), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [0, 1, 2])
tensor(7.4428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.0001363444171147421), (499999, 0.0), (499999, 0.0)]
([0, 0, 2], [0, 1, 2])
tensor(7.4428, device='cuda:0', grad_fn=<NllLossBackward0>)
[(442367, 0.00013659342948812

 99%|█████████▉| 720/725 [23:32:10<11:23, 136.76s/it]

test：1.0, test mean: 0.29365079365079366
([2, 0, 0], [2, 0, 0])
tensor(11.7522, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 9.336588391306577e-07), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(11.7522, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 1.2996508303331211e-05), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(11.7522, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 1.2996911209484097e-05), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(11.7522, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 1.2997314115636982e-05), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(11.7522, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 1.2997716112295166e-05), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(11.7522, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (454655, 1.299809810006991e-05), (499999, 0.0)]
([2, 0, 0], [2, 0, 0])
tensor(11.7522, device='cuda:0', grad_fn=<Nl

 99%|█████████▉| 721/725 [23:36:48<11:56, 179.05s/it]

test：1.0, test mean: 0.2964426877470356
([2, 0, 3], [2, 0, 3])
tensor(11.7938, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, 0.0), (499999, 0.0), (499999, 0.0)]


100%|█████████▉| 722/725 [23:37:14<06:39, 133.09s/it]

([7, 0, 7], [7, 0, 7])
tensor(16.9921, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.9181621074676514), (499999, 0.0), (380927, 2.9181621074676514)]
([7, 0, 7], [7, 0, 7])
tensor(9.2244, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 14.598701477050781), (499999, 0.0), (380927, 19.336936950683594)]
([7, 0, 7], [7, 0, 7])
tensor(8.1431, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 2.423943519592285), (499999, 0.0), (380927, 11.327884674072266)]
([7, 0, 0], [7, 0, 7])
tensor(0.3285, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 17.529184341430664), (499999, 0.0), (380927, 13.003251075744629)]
([0, 0, 0], [7, 0, 7])
tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 17.518529891967773), (499999, 0.0), (380927, 13.034564018249512)]
([0, 0, 0], [7, 0, 7])
tensor(0.0007, device='cuda:0', grad_fn=<NllLossBackward0>)
[(380927, 17.507877349853516), (499999, 0.0), (380927, 13.062674522399902)]
([0, 0, 0], [7, 0, 7])
tensor(0.0006, device='

100%|█████████▉| 723/725 [23:42:02<05:58, 179.40s/it]

test：0.0, test mean: 0.2952755905511811
([7, 2, 0], [7, 2, 0])
tensor(16.2474, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 2.7659714221954346), (499999, 0.0), (442367, 4.592854565999005e-06)]
([7, 2, 0], [7, 2, 0])
tensor(13.8536, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 20.05219268798828), (499999, 0.0), (442367, 0.00017068447778001428)]
([7, 2, 0], [7, 2, 0])
tensor(10.0640, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 48.10798645019531), (499999, 0.0), (442367, 0.00016925446107052267)]
([0, 2, 0], [7, 2, 0])
tensor(9.7079, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 48.1080322265625), (499999, 0.0), (442367, 0.00016924337251111865)]
([0, 2, 0], [7, 2, 0])
tensor(9.7079, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 48.10807800292969), (499999, 0.0), (442367, 0.00016923289513215423)]
([0, 2, 0], [7, 2, 0])
tensor(9.7079, device='cuda:0', grad_fn=<NllLossBackward0>)
[(368639, 48.108123779296875), (499999, 0.0), (442367, 0.00016926

100%|█████████▉| 724/725 [23:46:55<03:33, 213.75s/it]

test：0.5, test mean: 0.296078431372549
([0, 2], [0, 2])
tensor(13.4703, device='cuda:0', grad_fn=<NllLossBackward0>)
[(499999, -1.418677857145667e-05), (499999, 0.0)]


100%|██████████| 725/725 [23:47:17<00:00, 118.12s/it]


In [15]:
# if __name__=='__main__':
#     acc = []
#     preds = []
#     labels = []

#     for step, val_batch_data in enumerate(validloader):
#         print(step,len(validloader))
#         cur_batch_size = val_batch_data[0].size(0)

#         exe_input = val_batch_data[0].to(device) if use_gpu else val_batch_data[0]

#         label = val_batch_data[1].to(device) if use_gpu else val_batch_data[1]
#         fake_label = torch.zeros_like(label.squeeze())
#         # fake_label[fake_label == 9] = 0
#         # temp = torch.zeros((6,9))
#         # for i,j in enumerate(fake_label): temp[i][j] = 1
#         # fake_label = temp.cuda()
#         # fake_label = fake_label.reshape(-1,1)
#         label = label.squeeze() - 1
        
#         pred,temp = malconv(exe_input,ce_loss,fake_label,label)
        
#         pred = np.argmax(pred,1)
#         label = label.cpu().data.numpy().astype(int)
#         # temp_grad,temp = malconv(exe_input)
#         preds.extend(pred.tolist())
#         labels.extend(label.tolist())

#         val_Macc = (label == pred).sum()
#         val_Macc = val_Macc / cur_batch_size
#         acc.append(val_Macc)
#         print(f"test：{val_Macc}, test mean: {np.mean(acc)}")