In [90]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import pandas as pd

import torch.optim as optim
import time
from opacus import PrivacyEngine
from vantage6.tools.util import info, warn
from torchvision import transforms
import argparse
from torchvision import datasets, transforms

In [91]:
# simple model 

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output


In [92]:
model = Net()

for parameter in model.parameters():
    print(parameter)

Parameter containing:
tensor([[[[ 2.6659e-01, -1.8754e-01,  1.4056e-01],
          [-2.6799e-01,  2.4364e-02,  3.1870e-01],
          [ 3.1865e-02, -1.6704e-01, -4.5359e-02]]],


        [[[-2.2608e-01, -7.3469e-02, -1.9443e-01],
          [-3.0724e-01,  2.8450e-01,  7.1407e-02],
          [-1.9219e-01,  3.1421e-02, -2.4110e-01]]],


        [[[-6.7427e-02, -3.0053e-01,  3.2676e-01],
          [ 2.8281e-01,  4.3600e-02, -3.0157e-04],
          [ 3.0820e-01, -1.8431e-01,  4.6132e-02]]],


        [[[-2.9954e-01,  2.8935e-02,  3.5751e-02],
          [ 2.0908e-01, -3.1655e-01,  7.6181e-03],
          [-3.0553e-01, -2.4606e-01,  2.1488e-01]]],


        [[[-2.1892e-03, -6.4557e-02, -5.4479e-02],
          [-3.0532e-01,  3.0861e-01, -1.8689e-01],
          [-1.9416e-02,  5.9372e-02,  1.3005e-01]]],


        [[[-6.1848e-02, -2.9730e-01, -7.9388e-02],
          [ 2.2878e-01,  2.4139e-01, -1.5242e-01],
          [ 4.6400e-02, -2.9634e-01, -2.5943e-01]]],


        [[[ 2.1308e-01,  2.5997e-01,

In [93]:
parameters = model.parameters()

i = 0
with torch.no_grad():
    for param in model.parameters():
        # The first entry of the provided parameters when using dist.gather
        # method also contains the value from the server, remove that one
        minus_server = parameters[i][1:]
        # Calculate the average by summing and dividing by the number of
        # workers
        s = sum(minus_server)
        average = s/len(minus_server)
        # Change the parameter of the global model to the average
        param.data = average
        i = i + 1

TypeError: 'generator' object is not subscriptable

In [94]:
# initialises training

def RPC_initialize_training(data, gamma, learning_rate, local_dp):
    """
    Initializes the model, optimizer and scheduler and shares the parameters
    with all the workers in the group.

    This should be sent from server to all nodes.

    Args:
        data: contains the local data from the node
        gamma: Learning rate step gamma (default: 0.7)
        learning_rate: The learning rate for training.
        cuda: Should we use CUDA?
        local_dp: bool whether to apply local_dp or not.

    Returns:
        Returns the device, model, optimizer and scheduler.
    """
    
    # Determine the device to train on
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    # print("\033[0;{};49m Rank {} is training on {}".format(device))

    # Initialize model and send parameters of server to all workers
    model = Net()
    model.to(device)

    # intializing optimizer and scheduler
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    # adding DP if true
    if local_dp == True:
        privacy_engine = PrivacyEngine(model, batch_size=64,
                sample_size=60000, alphas=range(2,32), noise_multiplier=1.3,
                max_grad_norm=1.0,)
        privacy_engine.attach(optimizer)

    # returns device, model, optimizer which will be needed in train and test
    return device, model, optimizer    

In [96]:
# basic training of the model

# Question: train gets model, device, optimizer from initialize_training, which is specified within train function, 
# why do I need to call it again before executing the function? Because in vantage6 when I sent the tasks I cannot define that but only in the master function


def RPC_train(data, log_interval, local_dp, epoch, round, delta=1e-5):
    """
    Training the model on all batches.
    Args:
        epoch: The number of the epoch the training is in.
        round: The number of the round the training is in.
        log_interval: The amount of rounds before logging intermediate loss.
        local_dp: Training with local DP?
        delta: The delta value of DP to aim for (default: 1e-5).
    """
    # loading arguments/parameters from first RPC_method
    device, model, optimizer = RPC_initialize_training(data, gamma, learning_rate, local_dp) # is this allowed in vantage6? calling one RPC_method in another?
            
    model.train()
# , (data, target)
    for batch_idx, data in enumerate(data,0): 
        
#         batch = (data, target)
        data, target = data
        # Send the data and target to the device (cpu/gpu) the model is at
#         data, target = data.to(device), target.to(device)
        # Clear gradient buffers
        optimizer.zero_grad()
        # Run the model on the data
        output = model(data)
        # Calculate the loss
        loss = F.nll_loss(output, target)
        # Calculate the gradients
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print(loss)
#     # Update the model weights
#     if train:
#         optimizer.step()
#     return loss

#         if batch_idx % log_interval == 0:
#             print('\033[0;{};49m Train on Rank {}, Round {}, Epoch {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
#             round, epoch, batch_idx * len(batch[0]), len(train_loader.dataset),
#             100. * batch_idx / len(train_loader), loss.item()))
        

    # Adding differential privacy or not
    if local_dp == True:  
        epsilon, alpha = optimizer.privacy_engine.get_privacy_spent(delta)
            # print("\033[0;{};49m Epsilon {}, best alpha {}".format(epsilon, alpha))
     
    
    
#     running_loss = 0.0
#     for i, data in enumerate(trainloader, 0):
#         # get the inputs; data is a list of [inputs, labels]
#         inputs, labels = data

#         # zero the parameter gradients
#         optimizer.zero_grad()

#         # forward + backward + optimize
#         outputs = net(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()

#         # print statistics
#         running_loss += loss.item()
#         if i % 2000 == 1999:    # print every 2000 mini-batches
#             print('[%d, %5d] loss: %.3f' %
#                   (epoch + 1, i + 1, running_loss / 2000))
#             running_loss = 0.0



# # This function trains the neural network for one epoch
# def train(epoch):
#     model.train()
#     for batch_idx, (data, target) in enumerate(train_loader):
#         # Move the input and target data on the GPU
#         data, target = data.to(device), target.to(device)
#         # Zero out gradients from previous step
#         optimizer.zero_grad()
#         # Forward pass of the neural net
#         output = model(data)
#         # Calculation of the loss function
#         loss = F.nll_loss(output, target)
#         # Backward pass (gradient computation)
#         loss.backward()
#         # Adjusting the parameters according to the loss function
#         optimizer.step()
#         if batch_idx % 10 == 0:
#             print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
#                 epoch, batch_idx * len(data), len(train_loader.dataset),
#                 100. * batch_idx / len(train_loader), loss.item()))


In [None]:
# Model Evaluation

def RPC_test(data):
    """
    Tests the model.

    Args:
        color: The color for the terminal output for this worker.
        model: The model to test.
        device: The device to test the model on.
        test_loader: The local loader for test local. -> no inside function
    """


#     test_loader = torch.load("./testing.pt")
    test_loader = torch.utils.data.DataLoader(datasets.MNIST('../mnist_data', 
                                                          download=True, 
                                                              train=False,
                                                          transform=transforms.Compose([
                                                              transforms.ToTensor(), # first, convert image to PyTorch tensor
                                                              transforms.Normalize((0.1307,), (0.3081,)) # normalize inputs
                                                          ])), 
                                           batch_size=10, 
                                           shuffle=True)

    device, model, optimizer = RPC_initialize_training(data, gamma, learning_rate, local_dp)

    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            # Send the local and target to the device (cpu/gpu) the model is at
            data, target = data.to(device), target.to(device)
            # Run the model on the local
            output = model(data)
            # Calculate the loss
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            # Check whether prediction was correct
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print(test_loss)

    # print('\033[0;{};49m \nAverage loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
    #         test_loss, correct, len(test_loader.dataset),
    #         100. * correct / len(test_loader.dataset)))

In [None]:
# FedAvg gathering of parameters 

def RPC_get_parameters(data, model, parameters):
    """
    Get parameters from nodes
    """
    data_size = len(data) // 3 # number of nodes# size of dataset
    
    weights = []
    # Gather the data sizes on the server
    tensor_weights = torch.tensor(data_size)
    tensor_weights = tensor_weights[1:]
    # Convert all tensors back to weights
    for tensor in tensor_weights:
            weights.append(tensor.item())

            
    for parameters in model.parameters():
        return {
            "params": parameters,
        }


"""
this might need to be combined with training, so that train 
returns the parameters or that it at least calls the results of training function
"""



In [36]:
# averaging of returned parameters 

def average_parameters(data, model):
    """
    Get parameters from nodes and calculate the average
    :param model: torch model
    :param parameters: parameters of model
    :param weights:
    :return:
    """
    
    parameters = RPC_get_parameters() # makes returned parameters from RPC_get_parameters the parameters used in this function

    # TODO: local: since we usually just get the parameters, this well be an entire task, therefore, we might need to train for each individually


    with torch.no_grad():
        for parameters in model.parameters():
            average = sum(x * y for x, y in zip(parameters[i], weights)) / sum(weights)
            parameters.data = average
            i = i + 1
        return {
            "params_averaged": model
        }
    

#     i = 0
#     with torch.no_grad():
#     for param in model.parameters():
#     # The first entry of the provided parameters when using dist.gather
#     # method also contains the value from the server, remove that one
#     minus_server = parameters[i][1:]
#     # Calculate the average by summing and dividing by the number of
#     # workers
#     s = sum(minus_server)
#     average = s/len(minus_server)
#     # Change the parameter of the global model to the average
#     param.data = average
#     i = i + 1

In [None]:
# training with those averaged parameters

def RPC_fed_avg(data, local_dp, model, device, optimizer, epoch, delta=1e-5):
    """
    Training and testing the model on the workers concurrently using federated
    averaging, which means calculating the average of the local model
    parameters after a number of (local) epochs each training round.

    In vantage6, this method will be the training of the model with the average parameters (weighted)

    Returns:
        Returns the final model
    """
    # TODO: local: since we usually just get the parameters, this well be an entire task, therefore, we might need to train for each individually
    model = RPC_average_parameters()
    
    for epoch in range(1, epoch + 1):
        # Train the model on the workers again
        RPC_train(data, local_dp, model, device, optimizer, epoch, delta=1e-5)
        # Test the model on the workers
        RPC_test(data, model, device)

    gather_params = model.get_parameters() # or model.parameters()

    RPC_train(model.RPC_average_parameters_weighted(gather_params))

    return model, parameters


    ## OR 

#     parameters = RPC_average_parameters_weighted(data, model, parameters, weights) # then uses those parameters for training



        # # Gather the parameters after the training round on the server
        #     gather_params = coor.gather_parameters(rank, model, group_size + 1, subgroup)
        #
        #     # If the server
        #     if rank == 0:
        #         # Calculate the average of the parameters and adjust global model
        #         coor.average_parameters_weighted(model, gather_params, weights)
        #
        #     # Send the new model parameters to the workers
        #     coor.broadcast_parameters(model, group)


In [97]:
"""
These are the parameters needed for the function
Data loading and transforming (this will be done beforehand 
and then stored in './local/training.pt' and './testing.pt')
"""

learning_rate=0.01

gamma=0.7

# data_loader = torch.utils.data.DataLoader(datasets.MNIST('../mnist_data', 
#                                                           download=True, 
#                                                           transform=transforms.Compose([
#                                                               transforms.ToTensor(), # first, convert image to PyTorch tensor
#                                                               transforms.Normalize((0.1307,), (0.3081,)) # normalize inputs
#                                                           ])), 
#                                            batch_size=10, 
#                                            shuffle=True)
data = pd.read_csv("/Users/simontokloth/PycharmProjects/torch-vantage6/v6-ppsdg-py/local/mnist_train.csv")
data = data_loader


local_dp = True


# optimizer = optim.SGD(model.parameters(), lr=learning_rate)

epoch = 1

round = 1

"""
TODO, the RPC_initialize_training method is 
called in the RPC_train method, 
and yet it doesn't know where to get model, device, optimizer from. 
"""


"\nTODO, the RPC_initialize_training method is \ncalled in the RPC_train method, \nand yet it doesn't know where to get model, device, optimizer from. \n"

In [98]:
# local_dp = False
# RPC_initialize_training(data, gamma, learning_rate, local_dp)

log_interval = 5

# use_cuda = torch.cuda.is_available()
# device = torch.device("cuda" if use_cuda else "cpu")

# model = Net()
# optimizer = optim.SGD(model.parameters(), lr=learning_rate)

RPC_train(data, log_interval, local_dp, epoch, round)

# RPC_test(data)


  "The sample rate will be defined from ``batch_size`` and ``sample_size``."
  "Secure RNG turned off. This is perfectly fine for experimentation as it allows "


tensor(2.2792, grad_fn=<NllLossBackward>)
tensor(2.3574, grad_fn=<NllLossBackward>)
tensor(2.3095, grad_fn=<NllLossBackward>)
tensor(2.3172, grad_fn=<NllLossBackward>)
tensor(2.3023, grad_fn=<NllLossBackward>)
tensor(2.2945, grad_fn=<NllLossBackward>)
tensor(2.2522, grad_fn=<NllLossBackward>)
tensor(2.3519, grad_fn=<NllLossBackward>)
tensor(2.2895, grad_fn=<NllLossBackward>)
tensor(2.2183, grad_fn=<NllLossBackward>)
tensor(2.2276, grad_fn=<NllLossBackward>)
tensor(2.2940, grad_fn=<NllLossBackward>)
tensor(2.2814, grad_fn=<NllLossBackward>)
tensor(2.2439, grad_fn=<NllLossBackward>)
tensor(2.2546, grad_fn=<NllLossBackward>)
tensor(2.2976, grad_fn=<NllLossBackward>)
tensor(2.1888, grad_fn=<NllLossBackward>)
tensor(2.3296, grad_fn=<NllLossBackward>)
tensor(2.2411, grad_fn=<NllLossBackward>)
tensor(2.2261, grad_fn=<NllLossBackward>)
tensor(2.3369, grad_fn=<NllLossBackward>)
tensor(2.2615, grad_fn=<NllLossBackward>)
tensor(2.3089, grad_fn=<NllLossBackward>)
tensor(2.3935, grad_fn=<NllLossBac

tensor(2.0018, grad_fn=<NllLossBackward>)
tensor(1.7338, grad_fn=<NllLossBackward>)
tensor(2.2756, grad_fn=<NllLossBackward>)
tensor(1.6204, grad_fn=<NllLossBackward>)
tensor(2.3701, grad_fn=<NllLossBackward>)
tensor(2.4491, grad_fn=<NllLossBackward>)
tensor(1.6924, grad_fn=<NllLossBackward>)
tensor(1.6930, grad_fn=<NllLossBackward>)
tensor(2.5315, grad_fn=<NllLossBackward>)
tensor(2.1798, grad_fn=<NllLossBackward>)
tensor(2.3100, grad_fn=<NllLossBackward>)
tensor(2.5051, grad_fn=<NllLossBackward>)
tensor(2.5204, grad_fn=<NllLossBackward>)
tensor(2.4790, grad_fn=<NllLossBackward>)
tensor(1.8334, grad_fn=<NllLossBackward>)
tensor(2.2960, grad_fn=<NllLossBackward>)
tensor(2.0924, grad_fn=<NllLossBackward>)
tensor(1.9815, grad_fn=<NllLossBackward>)
tensor(2.1073, grad_fn=<NllLossBackward>)
tensor(2.5139, grad_fn=<NllLossBackward>)
tensor(2.1012, grad_fn=<NllLossBackward>)
tensor(1.9144, grad_fn=<NllLossBackward>)
tensor(2.3274, grad_fn=<NllLossBackward>)
tensor(1.7865, grad_fn=<NllLossBac

tensor(2.6395, grad_fn=<NllLossBackward>)
tensor(3.4726, grad_fn=<NllLossBackward>)
tensor(3.4479, grad_fn=<NllLossBackward>)
tensor(1.9493, grad_fn=<NllLossBackward>)
tensor(2.6623, grad_fn=<NllLossBackward>)
tensor(2.1857, grad_fn=<NllLossBackward>)
tensor(2.5026, grad_fn=<NllLossBackward>)
tensor(1.7021, grad_fn=<NllLossBackward>)
tensor(2.9951, grad_fn=<NllLossBackward>)
tensor(3.0750, grad_fn=<NllLossBackward>)
tensor(1.5774, grad_fn=<NllLossBackward>)
tensor(2.0389, grad_fn=<NllLossBackward>)
tensor(2.2669, grad_fn=<NllLossBackward>)
tensor(2.1963, grad_fn=<NllLossBackward>)
tensor(2.5257, grad_fn=<NllLossBackward>)
tensor(2.2628, grad_fn=<NllLossBackward>)
tensor(2.6074, grad_fn=<NllLossBackward>)
tensor(2.7638, grad_fn=<NllLossBackward>)
tensor(1.7077, grad_fn=<NllLossBackward>)
tensor(2.5327, grad_fn=<NllLossBackward>)
tensor(2.6098, grad_fn=<NllLossBackward>)
tensor(0.9482, grad_fn=<NllLossBackward>)
tensor(2.7040, grad_fn=<NllLossBackward>)
tensor(2.4828, grad_fn=<NllLossBac

tensor(5.0032, grad_fn=<NllLossBackward>)
tensor(3.3972, grad_fn=<NllLossBackward>)
tensor(3.5194, grad_fn=<NllLossBackward>)
tensor(2.9693, grad_fn=<NllLossBackward>)
tensor(2.3804, grad_fn=<NllLossBackward>)
tensor(2.3848, grad_fn=<NllLossBackward>)
tensor(4.8345, grad_fn=<NllLossBackward>)
tensor(3.8324, grad_fn=<NllLossBackward>)
tensor(2.1018, grad_fn=<NllLossBackward>)
tensor(1.8906, grad_fn=<NllLossBackward>)
tensor(2.4558, grad_fn=<NllLossBackward>)
tensor(3.7404, grad_fn=<NllLossBackward>)
tensor(2.8419, grad_fn=<NllLossBackward>)
tensor(3.5295, grad_fn=<NllLossBackward>)
tensor(4.5646, grad_fn=<NllLossBackward>)
tensor(3.1757, grad_fn=<NllLossBackward>)
tensor(1.1621, grad_fn=<NllLossBackward>)
tensor(4.4467, grad_fn=<NllLossBackward>)
tensor(4.0937, grad_fn=<NllLossBackward>)
tensor(2.6682, grad_fn=<NllLossBackward>)
tensor(2.9368, grad_fn=<NllLossBackward>)
tensor(3.4109, grad_fn=<NllLossBackward>)
tensor(1.9129, grad_fn=<NllLossBackward>)
tensor(2.8552, grad_fn=<NllLossBac

tensor(2.3287, grad_fn=<NllLossBackward>)
tensor(2.8887, grad_fn=<NllLossBackward>)
tensor(2.2884, grad_fn=<NllLossBackward>)
tensor(5.3037, grad_fn=<NllLossBackward>)
tensor(1.3674, grad_fn=<NllLossBackward>)
tensor(1.8158, grad_fn=<NllLossBackward>)
tensor(2.1581, grad_fn=<NllLossBackward>)
tensor(4.4635, grad_fn=<NllLossBackward>)
tensor(2.4012, grad_fn=<NllLossBackward>)
tensor(0.9607, grad_fn=<NllLossBackward>)
tensor(4.0572, grad_fn=<NllLossBackward>)
tensor(3.8335, grad_fn=<NllLossBackward>)
tensor(3.4396, grad_fn=<NllLossBackward>)
tensor(4.9905, grad_fn=<NllLossBackward>)
tensor(2.2768, grad_fn=<NllLossBackward>)
tensor(5.8426, grad_fn=<NllLossBackward>)
tensor(4.2964, grad_fn=<NllLossBackward>)
tensor(4.1688, grad_fn=<NllLossBackward>)
tensor(4.2137, grad_fn=<NllLossBackward>)
tensor(1.7830, grad_fn=<NllLossBackward>)
tensor(2.9328, grad_fn=<NllLossBackward>)
tensor(4.3001, grad_fn=<NllLossBackward>)
tensor(3.2685, grad_fn=<NllLossBackward>)
tensor(2.0514, grad_fn=<NllLossBac

tensor(2.5538, grad_fn=<NllLossBackward>)
tensor(3.6719, grad_fn=<NllLossBackward>)
tensor(4.2459, grad_fn=<NllLossBackward>)
tensor(6.1631, grad_fn=<NllLossBackward>)
tensor(3.5049, grad_fn=<NllLossBackward>)
tensor(1.8510, grad_fn=<NllLossBackward>)
tensor(2.3919, grad_fn=<NllLossBackward>)
tensor(4.3674, grad_fn=<NllLossBackward>)
tensor(2.4469, grad_fn=<NllLossBackward>)
tensor(2.7421, grad_fn=<NllLossBackward>)
tensor(1.5540, grad_fn=<NllLossBackward>)
tensor(3.7748, grad_fn=<NllLossBackward>)
tensor(2.8359, grad_fn=<NllLossBackward>)
tensor(3.6820, grad_fn=<NllLossBackward>)
tensor(2.1182, grad_fn=<NllLossBackward>)
tensor(2.4198, grad_fn=<NllLossBackward>)
tensor(4.3037, grad_fn=<NllLossBackward>)
tensor(4.8833, grad_fn=<NllLossBackward>)
tensor(3.3789, grad_fn=<NllLossBackward>)
tensor(5.2134, grad_fn=<NllLossBackward>)
tensor(2.5472, grad_fn=<NllLossBackward>)
tensor(3.9555, grad_fn=<NllLossBackward>)
tensor(3.6567, grad_fn=<NllLossBackward>)
tensor(4.2599, grad_fn=<NllLossBac

tensor(4.5693, grad_fn=<NllLossBackward>)
tensor(1.8990, grad_fn=<NllLossBackward>)
tensor(7.2491, grad_fn=<NllLossBackward>)
tensor(2.7299, grad_fn=<NllLossBackward>)
tensor(3.8705, grad_fn=<NllLossBackward>)
tensor(4.0509, grad_fn=<NllLossBackward>)
tensor(3.0670, grad_fn=<NllLossBackward>)
tensor(3.6739, grad_fn=<NllLossBackward>)
tensor(4.3420, grad_fn=<NllLossBackward>)
tensor(3.3764, grad_fn=<NllLossBackward>)
tensor(2.7198, grad_fn=<NllLossBackward>)
tensor(2.2546, grad_fn=<NllLossBackward>)
tensor(6.1244, grad_fn=<NllLossBackward>)
tensor(4.8727, grad_fn=<NllLossBackward>)
tensor(4.1101, grad_fn=<NllLossBackward>)
tensor(4.7575, grad_fn=<NllLossBackward>)
tensor(4.4341, grad_fn=<NllLossBackward>)
tensor(2.5794, grad_fn=<NllLossBackward>)
tensor(4.0694, grad_fn=<NllLossBackward>)
tensor(3.6645, grad_fn=<NllLossBackward>)
tensor(4.4746, grad_fn=<NllLossBackward>)
tensor(3.3510, grad_fn=<NllLossBackward>)
tensor(4.7276, grad_fn=<NllLossBackward>)
tensor(5.2122, grad_fn=<NllLossBac