# Pitfalls of Deep Learning and Practical tricks for training

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.datasets as datasets
from torchvision import transforms, io
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader
import torch.optim as optim
from PIL import Image
import copy

In [None]:
# Seed PyTorch's global random number generator so that runs are repeatable
torch.manual_seed(42)

## Models, classes and functions
Today we will use several classes and models that you can re-use in assignments and in your personal projects.

In [None]:
# Simple MLP as described in Lecture 4
class Model(nn.Module):
    """Fully connected network with two hidden ReLU layers.

    Layer sizes are ``input_size -> d_1 -> d_2 -> output_size``. The last
    linear layer is returned as-is, i.e. no activation is applied to it.
    """

    def __init__(self, input_size, d_1, d_2, output_size):
        super().__init__()
        # The three linear layers, created in the order they are applied
        self.fc1 = nn.Linear(input_size, d_1)
        self.fc2 = nn.Linear(d_1, d_2)
        self.fc3 = nn.Linear(d_2, output_size)

    def forward(self, x):
        """Return fc3(relu(fc2(relu(fc1(x)))))."""
        hidden_1 = F.relu(self.fc1(x))
        hidden_2 = F.relu(self.fc2(hidden_1))
        return self.fc3(hidden_2)

# Function generating noisy linear data, as described during Lecture 2
def create_dataset(sample_size=10, sigma=0.1, w_star=1, b_star = 1,
                   x_range=(-1, 1), seed=0):
    """Sample points from the noisy line ``y = w_star * x + b_star + noise``.

    Args:
        sample_size: number of points to generate.
        sigma: standard deviation of the Gaussian noise added to y.
        w_star: slope of the underlying line.
        b_star: intercept of the underlying line.
        x_range: pair ``(x_min, x_max)`` used to rescale the inputs.
        seed: value passed to ``torch.manual_seed``.

    Returns:
        The pair ``(X, y)``, two 1-D tensors with ``sample_size`` entries.

    Note:
        Calling this function re-seeds PyTorch's global random generator.
    """
    # Seed torch's global generator (this affects later random draws too)
    torch.manual_seed(seed)
    lower, upper = x_range
    # Uniform draws in [0, 1), stretched and shifted to [lower, upper)
    unit_samples = torch.rand(sample_size)
    X = unit_samples * (upper - lower) + lower
    # Noise-free targets on the line
    clean_targets = X * w_star + b_star
    # One Gaussian perturbation per point: mean 0, standard deviation sigma
    noise_mean = torch.zeros(sample_size)
    noise_std = sigma*torch.ones(sample_size)
    y = clean_targets + torch.normal(noise_mean, noise_std)
    return X, y

# Simpler model
class SimpleModel(nn.Module):
    """Network with a single hidden ReLU layer.

    Layer sizes are ``input_size -> d_1 -> output_size``; the output of the
    second linear layer is returned without any activation.
    """

    def __init__(self, input_size, d_1, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, d_1)
        self.fc2 = nn.Linear(d_1, output_size)

    def forward(self, x):
        """Return fc2(relu(fc1(x)))."""
        return self.fc2(F.relu(self.fc1(x)))
    
# Model with regularization
class RegModel(nn.Module):
    """MLP with batch normalization and dropout.

    The forward pass is
    ``fc1 -> ReLU -> BatchNorm1d -> fc2 -> ReLU -> Dropout -> fc3``.

    Args:
        input_size: number of input features.
        d_1: width of the first hidden layer (also the BatchNorm size).
        d_2: width of the second hidden layer.
        output_size: number of outputs.
        p: dropout probability passed to ``nn.Dropout``.
    """

    def __init__(self, input_size, d_1, d_2, output_size, p=0.2):
        super().__init__()
        self.fc1 = nn.Linear(input_size, d_1)
        self.BN = nn.BatchNorm1d(d_1)
        self.fc2 = nn.Linear(d_1, d_2)
        self.dropout = nn.Dropout(p)
        self.fc3 = nn.Linear(d_2, output_size)

    def forward(self, x):
        """Run the input through the network and return the final layer output."""
        # First hidden layer; batch norm is applied after the activation
        normalized = self.BN(F.relu(self.fc1(x)))
        # Second hidden layer; dropout is applied after the activation
        dropped = self.dropout(F.relu(self.fc2(normalized)))
        return self.fc3(dropped)
    
# Template class for Early Stopping
class EarlyStopping():
    """Signal when training should stop because validation loss stalled.

    Call the instance once per epoch as ``early_stop(model, val_loss)``.
    It returns ``True`` once ``patience`` consecutive calls have failed to
    improve the best loss by more than ``min_delta``; otherwise ``False``.

    Args:
        patience: number of consecutive non-improving calls tolerated.
        min_delta: an epoch counts as an improvement only if the best loss
            decreases by strictly more than this amount.
        restore_best_weights: if True, the weights of the best model seen so
            far are loaded back into ``model`` at the moment of stopping.

    Attributes set by the calls:
        best_model: deep copy of the model holding the best weights.
        best_loss: lowest validation loss seen so far.
        counter: current number of consecutive non-improving calls.
        status: short human-readable progress string.
    """

    def __init__(self, patience=5, min_delta=0, restore_best_weights=True):
        self.patience = patience
        self.min_delta = min_delta # minimum improvement to reset patience
        self.restore_best_weights = restore_best_weights
        self.best_model = None
        self.best_loss = None
        self.counter = 0
        self.status = ""

    def __call__(self, model, val_loss):
        """Record ``val_loss`` and return True if training should stop."""
        if self.best_loss is None:
            # First call: whatever we have is the best so far
            self.best_loss = val_loss
            self.best_model = copy.deepcopy(model)
        elif self.best_loss - val_loss > self.min_delta:
            # Real improvement: remember the weights and reset the patience
            self.best_loss = val_loss
            self.counter = 0
            self.best_model.load_state_dict(model.state_dict())
        else:
            # Everything else is "no improvement". A plain `else` also covers
            # an improvement of exactly min_delta (e.g. a flat loss with
            # min_delta=0) and NaN losses, which would otherwise be ignored
            # and never consume patience.
            self.counter += 1
            if self.counter >= self.patience:
                self.status = f"Stopped on {self.counter}"
                if self.restore_best_weights:
                    model.load_state_dict(self.best_model.state_dict())
                return True
        self.status = f"{self.counter}/{self.patience}"
        return False


## Overconfidence in Deep Learning

Let's start with last week's model, with which we were able to achieve good accuracy values.

In [None]:
# Baseline experiment: train the plain MLP (Model) on MNIST for a few epochs,
# print train/validation loss after every epoch and the final test accuracy.

# Datasets
dataset_train = datasets.MNIST('.', train=True, download=True, transform=transforms.ToTensor()) # download = True just once
dataset_test = datasets.MNIST('.', train=False, download=True, transform=transforms.ToTensor())

# Split the official test set into two halves: validation and test
dataset_validation, dataset_test = torch.utils.data.random_split(dataset_test, [0.5, 0.5])

# Dataloader
batch_size = 64 # Reduce it in case you need it

train_loader = DataLoader(dataset_train, batch_size=batch_size)
# Validation and test sets are each loaded as one single batch
validation_loader = DataLoader(dataset_validation, batch_size=len(dataset_validation))
test_loader = DataLoader(dataset_test, batch_size=len(dataset_test))

# Hyperparameters
d_1 = 100
d_2 = 50
model = Model(28*28, d_1, d_2, 10)

# Hyperparameter!
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Prefer CUDA, then Apple MPS, then fall back to the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'mps'
    if torch.backends.mps.is_available() else 'cpu')
model = model.to(DEVICE)
print("Working on", DEVICE)

n_epochs = 5
for epoch in range(n_epochs):
    # Set the model in training mode (once per epoch is enough: nothing in
    # the batch loop changes the mode)
    model.train()
    for data, target in train_loader:
        data, target = data.to(DEVICE), target.to(DEVICE)
        # You have to flatten the data!
        data = data.reshape(-1,28*28)
        # Set the gradient to 0
        optimizer.zero_grad()
        # Make a prediction
        output = model(data)
        # Compute the loss function
        loss = loss_fn(output, target)
        # Backpropagation
        loss.backward()
        # Update parameters
        optimizer.step()

    # NOTE: this is the loss of the last training batch only, not an average
    train_loss = loss.item()
    # At the end of every epoch, check the validation loss value
    model.eval()
    with torch.no_grad():
        for data, target in validation_loader: # Just one batch
            data, target = data.to(DEVICE), target.to(DEVICE)
            # You have to flatten the data!
            data = data.reshape(-1,28*28)
            # Make a prediction
            output = model(data)
            # Compute the loss function
            validation_loss = loss_fn(output, target).item()
            print(f"Epoch {epoch + 1}: Train loss: {train_loss}, Validation loss {validation_loss}")


# Compute the accuracy on the test set
# Explicitly switch to evaluation mode (as the other cells do) instead of
# relying on the mode left behind by the last validation pass
model.eval()
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for data, target in test_loader:
        data, target = data.to(DEVICE), target.to(DEVICE)
        # You have to flatten the data!
        data = data.reshape(-1,28*28)
        outputs = model(data)
        # The predicted class is the index of the largest score
        predicted = outputs.argmax(dim=1)
        n_samples += target.size(0)
        n_correct += (predicted == target).sum().item()

    acc = 100.0 * n_correct / n_samples
print("Accuracy on the test set:", acc, "%")


What happens if we ask the model to classify the picture of a chicken?

In [None]:
# Load the image
image = Image.open('/kaggle/input/practical-tricks-data/chicken_28x28.png')

display(image)

# Preprocess the picture the same way as the MNIST training data, which was
# loaded with transforms.ToTensor(): ToTensor rescales 8-bit pixels to [0, 1],
# whereas PILToTensor keeps the raw 0-255 values (inputs ~255x larger than
# anything the model has seen during training).
transform = transforms.Compose([
    transforms.ToTensor()
])

# Force a single grayscale channel so that the flattened vector has one value
# per pixel (28*28 for a 28x28 picture), matching the model input size even if
# the PNG is stored as RGB/RGBA.
X = transform(image.convert("L")).float().flatten()
print(X.shape)

Now, we want to predict the class of the chicken

We note that our model is overconfident: it assigns a very high probability to a digit class (e.g. an 8), even though the picture does not show a digit at all.

In [None]:
# Put the flattened image on the same device as the model
X = X.to(DEVICE)
# Raw class scores produced by the network
output = model(X)
# X is a single 1-D sample, so the class scores lie along dimension 0
probabilities = torch.softmax(output, dim=0)
print("With probability", 100 * probabilities.max().item(), "% is a ", probabilities.argmax().item())

## Overfitting

In this part of the notebook, we fake an overfitting case to see what happens to the loss function. To do so, we generate some noisy random data from a linear function and fit them with a non-linear model. We will see that the learned curve is non-linear.

In [None]:
# Overfitting demo: fit a wide one-hidden-layer network to a handful of noisy
# points drawn from a line, tracking train and validation loss at every step.

# Ground-truth slope and intercept of the line
w_star = 2
b_star = 1

# Dataset sizes
num_samples_train = 20
num_samples_validation = 10

# Different seeds give different train / validation samples
seed_train = 0
seed_validation = 1

# Standard deviation of the noise
sigma = 1.3

# Interval the inputs are drawn from
x = (-2, 2)

# Train data first, then validation data from the same distribution
X_train, y_train = create_dataset(
    sample_size=num_samples_train, sigma=sigma, w_star=w_star,
    b_star = b_star, x_range=x, seed=seed_train)
X_val, y_val = create_dataset(
    sample_size=num_samples_validation, sigma=sigma, w_star=w_star,
    b_star = b_star, x_range=x, seed=seed_validation)

# Turn every 1-D tensor into a column: one sample per row, one feature
X_train, y_train = X_train.reshape(-1, 1), y_train.reshape(-1, 1)
X_val, y_val = X_val.reshape(-1, 1), y_val.reshape(-1, 1)

# Model with 400 hidden units, mean squared error, plain SGD
model = SimpleModel(1, 400, 1)
loss_fn = nn.MSELoss()
learning_rate = 0.01
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

# Loss history, one entry per gradient update
train_loss_vals = []
val_loss_vals = []
n_steps = 5000 # Number of updates of the gradient
for step in range(n_steps):
    # *** Training update on the full training set ***
    model.train()
    optimizer.zero_grad() # Or model.zero_grad()
    loss = loss_fn(model(X_train), y_train)
    loss.backward()
    optimizer.step()

    # *** Evaluation: nothing here contributes to the gradient ***
    model.eval()
    with torch.no_grad():
        loss_val = loss_fn(model(X_val), y_val)

    # Report every 50 steps
    if step % 50 == 0:
        print("Step:", step, "Train loss: ", loss.item(), "- Loss eval:", loss_val.item())

    train_loss_vals.append(loss.item())
    val_loss_vals.append(loss_val.item())

Plot the loss functions together... do you notice something strange?

In [None]:
# Training and validation loss curves on the same axes, one point per step
steps = range(n_steps)
plt.figure()
for history in (train_loss_vals, val_loss_vals):
    plt.plot(steps, history)
plt.legend(["Tr. loss", "Val. loss"])
plt.show()

Plot the training data together with the learned function. Note that the model is reproducing the training data rather than generalizing from them!

In [None]:
# 1000 evenly spaced inputs in [-2, 2], shaped as a column for the model
x_range = torch.linspace(-2, 2, steps=1000).reshape(-1, 1)
# Model output on the grid, detached from the graph for plotting
y_curve = model(x_range).detach().numpy()

plt.figure()
# Training points as markers, learned function as a line
plt.plot(X_train, y_train, 'o')
plt.plot(x_range, y_curve)
plt.show()

## Training with Dropout and Batch Normalization

In this section, we train a model with both Dropout and Batch Normalization on the MNIST dataset. Afterwards, we will compare its performance on both the test set and the chicken image. Observe that, in this case, the training is slower and you actually need GPUs.

In [None]:
# Train RegModel (BatchNorm + Dropout) on MNIST, printing the train and
# validation loss after each epoch and the test accuracy at the end.

# ---- Data: the official test set is halved into validation and test ----
dataset_train = datasets.MNIST('.', train=True, download=False, transform=transforms.ToTensor())
dataset_test = datasets.MNIST('.', train=False, download=False, transform=transforms.ToTensor())
dataset_validation, dataset_test = torch.utils.data.random_split(dataset_test, [0.5, 0.5])

batch_size = 64 # Reduce it in case you need it
train_loader = DataLoader(dataset_train, batch_size=batch_size)
# Validation and test data are each served as one single batch
validation_loader = DataLoader(dataset_validation, batch_size=len(dataset_validation))
test_loader = DataLoader(dataset_test, batch_size=len(dataset_test))

# ---- Model, optimizer and loss ----
d_1 = 100
d_2 = 50
model = RegModel(28*28, d_1, d_2, 10)

learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# ---- Device: CUDA first, then MPS, otherwise CPU ----
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')
model = model.to(DEVICE)
print("Working on", DEVICE)

# ---- Training ----
n_epochs = 5
for epoch in range(n_epochs):
    # Training mode: dropout is active and BatchNorm uses batch statistics
    model.train()
    for images, labels in train_loader:
        # Move the batch to the device and flatten every image to a vector
        images = images.to(DEVICE).reshape(-1, 28*28)
        labels = labels.to(DEVICE)
        optimizer.zero_grad()
        loss = loss_fn(model(images), labels)
        loss.backward()
        optimizer.step()

    # Loss of the last training batch of the epoch
    train_loss = loss.item()

    # Validation pass in evaluation mode, without tracking gradients
    model.eval()
    with torch.no_grad():
        for images, labels in validation_loader: # Just one batch
            images = images.to(DEVICE).reshape(-1, 28*28)
            labels = labels.to(DEVICE)
            validation_loss = loss_fn(model(images), labels).item()
            print(f"Epoch {epoch + 1}: Train loss: {train_loss}, Validation loss {validation_loss}")

# ---- Accuracy on the test set ----
model.eval()
n_correct = 0
n_samples = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(DEVICE).reshape(-1, 28*28)
        labels = labels.to(DEVICE)
        # Predicted class = index of the largest score
        predicted = model(images).argmax(dim=1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

acc = 100.0 * n_correct / n_samples
print("Accuracy on the test set:", acc, "%")

## Early stopping

In this section, we will retrain our simple MLP (`Model`) for several epochs on the MNIST dataset and observe how the `EarlyStopping` class is used.

In [None]:
# Train the MLP on MNIST for up to 30 epochs, stopping early when the
# validation loss no longer improves, then report the test accuracy.

# ---- Data: the official test set is halved into validation and test ----
dataset_train = datasets.MNIST('.', train=True, download=False, transform=transforms.ToTensor())
dataset_test = datasets.MNIST('.', train=False, download=False, transform=transforms.ToTensor())
dataset_validation, dataset_test = torch.utils.data.random_split(dataset_test, [0.5, 0.5])

batch_size = 64 # Reduce it in case you need it
train_loader = DataLoader(dataset_train, batch_size=batch_size)
# Validation and test data are each served as one single batch
validation_loader = DataLoader(dataset_validation, batch_size=len(dataset_validation))
test_loader = DataLoader(dataset_test, batch_size=len(dataset_test))

# ---- Model ----
d_1 = 100
d_2 = 50
model = Model(28*28, d_1, d_2, 10)

# ---- Optimizer, loss and early-stopping helper ----
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
early_stop = EarlyStopping(patience=2, min_delta = 0.0001)

# ---- Device: CUDA first, then MPS, otherwise CPU ----
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')
model = model.to(DEVICE)
print("Working on", DEVICE)

# ---- Training ----
n_epochs = 30
for epoch in range(n_epochs):
    model.train()
    for images, labels in train_loader:
        # Move the batch to the device and flatten every image to a vector
        images = images.to(DEVICE).reshape(-1, 28*28)
        labels = labels.to(DEVICE)
        optimizer.zero_grad()
        loss = loss_fn(model(images), labels)
        loss.backward()
        optimizer.step()

    # Loss of the last training batch of the epoch
    train_loss = loss.item()

    # Validation pass in evaluation mode, without tracking gradients
    model.eval()
    with torch.no_grad():
        for images, labels in validation_loader: # Just one batch
            images = images.to(DEVICE).reshape(-1, 28*28)
            labels = labels.to(DEVICE)
            validation_loss = loss_fn(model(images), labels).item()
            print(f"Epoch {epoch + 1}: Train loss: {train_loss}, Validation loss {validation_loss}")

    # Ask the helper whether the validation loss has stalled for too long
    should_stop = early_stop(model, validation_loss)
    if should_stop:
        print(f"Stopped trained at Epoch {epoch + 1}")
        break

# ---- Accuracy on the test set ----
model.eval()
n_correct = 0
n_samples = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(DEVICE).reshape(-1, 28*28)
        labels = labels.to(DEVICE)
        # Predicted class = index of the largest score
        predicted = model(images).argmax(dim=1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

acc = 100.0 * n_correct / n_samples
print("Accuracy on the test set:", acc, "%")

## $L_1, L_2$ penalties in the training loop

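In this section, we see how to implement an $L_1$ or $L_2$ penalty in the training loop. The cell asks you for the regularization type (stored in `r`: 1 for $L_1$, 2 for $L_2$) and for the value of $\lambda$ to be used for the training. The $L_2$ penalty is applied through the optimizer's `weight_decay` argument, while the $L_1$ penalty is added to the loss by hand.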

In [None]:
# Train the MLP on MNIST with an L1 or L2 penalty chosen interactively,
# printing the last batch loss of every epoch and the final test accuracy.

# ---- Data: the official test set is halved into validation and test ----
dataset_train = datasets.MNIST('.', train=True, download=False, transform=transforms.ToTensor())
dataset_test = datasets.MNIST('.', train=False, download=False, transform=transforms.ToTensor())
dataset_validation, dataset_test = torch.utils.data.random_split(dataset_test, [0.5, 0.5])

batch_size = 64 # Reduce it in case you need it
train_loader = DataLoader(dataset_train, batch_size=batch_size)
validation_loader = DataLoader(dataset_validation, batch_size=len(dataset_validation))
test_loader = DataLoader(dataset_test, batch_size=len(dataset_test))

# ---- Model ----
d_1 = 100
d_2 = 50
model = Model(28*28, d_1, d_2, 10)

learning_rate = 0.01

# ---- Regularization settings, read from the user ----
# r selects the penalty (1 -> L1, 2 -> L2), l is the lambda coefficient
r = int(input("Insert the regularization method, 1 or 2:"))
l = float(input("Insert lambda value"))
if r not in (1, 2):
    raise ValueError("Not an L1/L2 regularization")
# L2 is delegated to the optimizer through weight_decay;
# for L1 the optimizer gets no decay and the penalty is added to the loss
weight_decay = l if r == 2 else 0.
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

loss_fn = nn.CrossEntropyLoss()

# ---- Device: CUDA first, then MPS, otherwise CPU ----
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
elif torch.backends.mps.is_available():
    DEVICE = torch.device('mps')
else:
    DEVICE = torch.device('cpu')
model = model.to(DEVICE)
print("Working on", DEVICE)

# ---- Training ----
n_epochs = 5

for epoch in range(n_epochs):
    model.train()
    for images, labels in train_loader:
        # Move the batch to the device and flatten every image to a vector
        images = images.to(DEVICE).reshape(-1, 28*28)
        labels = labels.to(DEVICE)
        optimizer.zero_grad()
        loss = loss_fn(model(images), labels)

        # L1 case: add lambda times the sum of absolute values of all parameters
        if r == 1:
            l1_norm = sum(p.abs().sum() for p in model.parameters())
            loss = loss + l * l1_norm

        loss.backward()
        optimizer.step()
    print("Last train loss of the epoch:", loss.item())

# ---- Accuracy on the test set ----
model.eval()
n_correct = 0
n_samples = 0
with torch.no_grad():
    for images, labels in test_loader:
        images = images.to(DEVICE).reshape(-1, 28*28)
        labels = labels.to(DEVICE)
        # Predicted class = index of the largest score
        predicted = model(images).argmax(dim=1)
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

acc = 100.0 * n_correct / n_samples
print("Accuracy on the test set:", acc, "%")

## Training with accumulated gradient

In this section, we train our simple MLP using gradient accumulation: the gradients of several mini-batches are summed before each parameter update.


In [None]:
# Train the MLP on MNIST with gradient accumulation: the gradients of
# accum_iter consecutive mini-batches are summed before every optimizer step.

# Datasets
dataset_train = datasets.MNIST('.', train=True, download=False, transform=transforms.ToTensor())
dataset_test = datasets.MNIST('.', train=False, download=False, transform=transforms.ToTensor())

# Split the official test set into two halves: validation and test
dataset_validation, dataset_test = torch.utils.data.random_split(dataset_test, [0.5, 0.5])

# Dataloader
batch_size = 64 # Reduce it in case you need it
accum_iter = 4  # Number of mini-batches accumulated per parameter update

train_loader = DataLoader(dataset_train, batch_size=batch_size)
validation_loader = DataLoader(dataset_validation, batch_size=len(dataset_validation))
test_loader = DataLoader(dataset_test, batch_size=len(dataset_test))

# Hyperparameters
d_1 = 100
d_2 = 50
model = Model(28*28, d_1, d_2, 10)

# Hyperparameter!
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

# Prefer CUDA, then Apple MPS, then fall back to the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'mps'
    if torch.backends.mps.is_available() else 'cpu')
model = model.to(DEVICE)
print("Working on", DEVICE)

n_epochs = 5
n_batches = len(train_loader)
for epoch in range(n_epochs):
    # Set the model in training mode
    model.train()
    # Start every epoch with clean gradients
    optimizer.zero_grad()
    for batch_idx, (data, target) in enumerate(train_loader):
        # Forward pass
        data, target = data.to(DEVICE), target.to(DEVICE)
        # You have to flatten the data!
        data = data.reshape(-1,28*28)
        # Make a prediction
        output = model(data)
        # Compute the loss function
        loss = loss_fn(output, target)

        # Scale the loss to account for gradient accumulation, so that every
        # batch contributes with 1/accum_iter to the accumulated gradient.
        # `loss` itself is left unscaled so that the value reported below is
        # comparable with the validation loss.
        (loss / accum_iter).backward()

        # Update every accum_iter batches, and also on the last batch of the
        # epoch: otherwise, when the number of batches is not a multiple of
        # accum_iter, the gradients of the trailing batches would never be
        # applied in this epoch and would leak into the next epoch's first update.
        is_last_batch = (batch_idx + 1) == n_batches
        if (batch_idx + 1) % accum_iter == 0 or is_last_batch:
            # Update parameters
            optimizer.step()
            # Set the gradient to 0
            optimizer.zero_grad()

    # Loss of the last training batch of the epoch
    train_loss = loss.item()
    # At the end of every epoch, check the validation loss value
    model.eval()
    with torch.no_grad():
        for data, target in validation_loader: # Just one batch
            data, target = data.to(DEVICE), target.to(DEVICE)
            # You have to flatten the data!
            data = data.reshape(-1,28*28)
            # Make a prediction
            output = model(data)
            # Compute the loss function
            validation_loss = loss_fn(output, target).item()
            print(f"Epoch {epoch + 1}: Train loss: {train_loss}, Validation loss {validation_loss}")


# Compute the accuracy on the test set
# Explicitly switch to evaluation mode, as the other cells do
model.eval()
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for data, target in test_loader:
        data, target = data.to(DEVICE), target.to(DEVICE)
        # You have to flatten the data!
        data = data.reshape(-1,28*28)
        outputs = model(data)
        # The predicted class is the index of the largest score
        predicted = outputs.argmax(dim=1)
        n_samples += target.size(0)
        n_correct += (predicted == target).sum().item()

    acc = 100.0 * n_correct / n_samples
print("Accuracy on the test set:", acc, "%")


## Training with a scheduler

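In this section, we show the usage of a scheduler to reduce the learning rate during training. Here we use `ExponentialLR`, which multiplies the learning rate by `gamma` every time `scheduler.step()` is called (once per epoch); to reduce it only when the validation loss stops decreasing, you can use `ReduceLROnPlateau` instead.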

In [None]:
# Train the MLP on MNIST while an ExponentialLR scheduler changes the
# learning rate once per epoch; report losses, learning rate and test accuracy.

# Datasets
dataset_train = datasets.MNIST('.', train=True, download=True, transform=transforms.ToTensor()) # download = True just once
dataset_test = datasets.MNIST('.', train=False, download=True, transform=transforms.ToTensor())

# Split the official test set into two halves: validation and test
dataset_validation, dataset_test = torch.utils.data.random_split(dataset_test, [0.5, 0.5])

# Dataloader
batch_size = 64 # Reduce it in case you need it

train_loader = DataLoader(dataset_train, batch_size=batch_size)
validation_loader = DataLoader(dataset_validation, batch_size=len(dataset_validation))
test_loader = DataLoader(dataset_test, batch_size=len(dataset_test))

# Hyperparameters
d_1 = 100
d_2 = 50
model = Model(28*28, d_1, d_2, 10)

# Hyperparameter!
learning_rate = 0.001

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()
# Scheduler attached to the optimizer; gamma is its decay factor
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)


# Prefer CUDA, then Apple MPS, then fall back to the CPU
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'mps'
    if torch.backends.mps.is_available() else 'cpu')
model = model.to(DEVICE)
print("Working on", DEVICE)

n_epochs = 10
print("Starting learning rate:", learning_rate)
for epoch in range(n_epochs):
    # Set the model in training mode
    model.train()
    for data, target in train_loader:
        data, target = data.to(DEVICE), target.to(DEVICE)
        # You have to flatten the data!
        data = data.reshape(-1,28*28)
        # Set the gradient to 0
        optimizer.zero_grad()
        # Make a prediction
        output = model(data)
        # Compute the loss function
        loss = loss_fn(output, target)
        # Backpropagation
        loss.backward()
        # Update parameters
        optimizer.step()

    # Loss of the last training batch of the epoch
    train_loss = loss.item()
    # At the end of every epoch, check the validation loss value
    model.eval()
    with torch.no_grad():
        for data, target in validation_loader: # Just one batch
            data, target = data.to(DEVICE), target.to(DEVICE)
            # You have to flatten the data!
            data = data.reshape(-1,28*28)
            # Make a prediction
            output = model(data)
            # Compute the loss function; .item() gives a plain Python number,
            # so the message below prints a value instead of "tensor(...)"
            validation_loss = loss_fn(output, target).item()
            print(f"Epoch {epoch + 1}: Train loss: {train_loss}, Validation loss {validation_loss}")

    # Step the scheduler exactly once per epoch, outside the validation batch
    # loop, so the decay does not depend on the number of validation batches
    scheduler.step()
    print("New learning rate:", round(scheduler.get_last_lr()[0], 6))


# Compute the accuracy on the test set
# Explicitly switch to evaluation mode, as the other cells do
model.eval()
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for data, target in test_loader:
        data, target = data.to(DEVICE), target.to(DEVICE)
        # You have to flatten the data!
        data = data.reshape(-1,28*28)
        outputs = model(data)
        # The predicted class is the index of the largest score
        predicted = outputs.argmax(dim=1)
        n_samples += target.size(0)
        n_correct += (predicted == target).sum().item()

    acc = 100.0 * n_correct / n_samples
print("Accuracy on the test set:", acc, "%")