#Exercise

a) In this scenario, we want to use the norm of the mean of the gradients as the clipping bound. Please complete the implementation of the clipping function in the code below.

*Note: By using the norm of the mean of the gradients as the clipping bound, you should reach an accuracy above 90% after 10 epochs.*

b) Now, it is your turn to think of a way to ensure privacy during the training of our neural network. The norm of the mean of the gradients introduces some leakage which we want to avoid. How can we get rid of this kind of leakage? Please implement your solution in the code below.

Please make sure to select **GPU** under **Runtime -> Change Runtime Type -> Hardware Accelerator**.

### Imports

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

from tqdm.notebook import tqdm
import numpy as np
import matplotlib.pyplot as plt

from torch.nn.utils import clip_grad_norm_

### Define our model (a fully connected network)

In [None]:
class Model(nn.Module):
    """Fully connected classifier: 28*28 inputs -> 512 hidden units (ReLU) -> 10 outputs."""

    def __init__(self):
        super().__init__()
        hidden_units, class_count = 512, 10
        # fc1 is created before fc2 so that parameter initialization consumes
        # the random number generator in the same order as before.
        self.fc1 = nn.Linear(28*28, hidden_units)
        self.fc2 = nn.Linear(hidden_units, class_count)

    def forward(self, x):
        """Flatten every sample of ``x`` and return the raw (un-normalized) class scores."""
        flat = x.view(-1, self.num_flat_features(x))
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

    def num_flat_features(self, x):
        """Return the number of elements per sample, i.e. the product of all dims after the first."""
        per_sample_shape = x.size()[1:]
        # numel() of an empty shape is 1, matching a product over no dimensions.
        return torch.Size(per_sample_shape).numel()

### Function to download our datasets

In [None]:
def get_mnist_dataset(root_dir='../data'):
    """Build the MNIST train and test datasets stored under ``root_dir``.

    Both splits are created with ``download=True`` and share one transform
    pipeline: conversion to a tensor followed by normalization with mean
    0.1307 and standard deviation 0.3081.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple.
    """
    preprocessing = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,)),
    ])
    # Same constructor call for both splits; only the ``train`` flag differs.
    train_split, test_split = (
        datasets.MNIST(root_dir, train=is_train, download=True, transform=preprocessing)
        for is_train in (True, False)
    )
    return train_split, test_split

In [None]:

##################
#   Exercise a)  #
##################

def clip(gradient, c_bound):
    """Rescale ``gradient`` so that its L2 norm does not exceed ``c_bound``.

    Gradients whose norm is already within the bound are returned unchanged
    in value; longer gradients are scaled down to have norm ``c_bound`` while
    keeping their direction.

    Args:
        gradient: Tensor holding the gradient of a single data point.
        c_bound: Non-negative clipping bound (Python number or 0-dim tensor).

    Returns:
        A new tensor with the same shape as ``gradient``.
    """
    grad_norm = torch.norm(gradient)
    # The small epsilon avoids 0/0 (NaN) when both the gradient norm and the
    # bound are zero; in that case the scale becomes 0 and the result is zero.
    scale = torch.clamp(c_bound / (grad_norm + 1e-12), max=1.0)
    new_gradient = gradient * scale
    return new_gradient


### Define our training function

In [None]:
def train(model, device, train_loader, optimizer, epoch, noise_multiplier, clip_bound=None):
    """Train ``model`` for one epoch with per-sample gradient clipping and Gaussian noise.

    For every batch, the gradient of each data point is computed separately.
    For each parameter tensor the per-sample gradients are clipped with
    ``clip``, averaged, perturbed with Gaussian noise and written to
    ``param.grad`` before ``optimizer.step()`` is called.

    Args:
        model: Network to train; it is switched to training mode.
        device: Device the batches are moved to.
        train_loader: Iterable yielding ``(data, target)`` batches.
        optimizer: Optimizer that owns the model's parameters.
        epoch: Current epoch number (not used inside this function).
        noise_multiplier: Factor applied to the clipping bound to obtain the
            standard deviation of the noise.
        clip_bound: Optional fixed clipping bound. If ``None`` (default), the
            norm of the mean of the per-sample gradients of each parameter
            tensor is used, which depends on the data of the batch. Passing a
            constant makes the bound independent of the data.

    Returns:
        Tuple ``(avg_loss, avg_acc)`` of 0-dim NumPy arrays with the mean
        per-sample loss and accuracy over the epoch.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()

    losses = []
    top1_acc = []

    # Loop over all batches in our training set
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)

        # The last batch can be smaller than the loader's nominal batch size,
        # so the actual number of samples is used to scale the noise.
        batch_len = len(data)
        if batch_len == 0:
            continue

        params = list(model.parameters())

        # One list of per-sample gradients for every parameter tensor
        grad_list = [[] for _ in params]

        # Loop over all data points in the current batch
        for sample, label in zip(data, target):
            # Add a batch dimension without modifying the batch tensors in place
            x = sample.unsqueeze(0)
            y = label.unsqueeze(0)

            # Reset model's current gradients
            optimizer.zero_grad()

            # Forward pass and loss for the current data point
            y_hat = model(x)
            loss = criterion(y_hat, y)
            pred = torch.argmax(y_hat.detach())

            # Save performance values as plain Python floats
            losses.append(loss.item())
            top1_acc.append(torch.eq(pred, y).float().item())

            # Perform back propagation / calculate gradients
            loss.backward()

            # Save gradients for current data point
            for per_sample, p in zip(grad_list, params):
                per_sample.append(p.grad.detach().clone())

        # Reset model's current gradients
        optimizer.zero_grad()

        for param, per_sample in zip(params, grad_list):
            if clip_bound is None:
                # Data-dependent bound: norm of the mean of the gradients
                bound = torch.norm(torch.mean(torch.stack(per_sample), dim=0))
            else:
                # Fixed, data-independent bound
                bound = torch.as_tensor(
                    clip_bound, dtype=per_sample[0].dtype, device=per_sample[0].device
                )

            # Clip every per-sample gradient, then average
            clipped = torch.stack([clip(g, bound) for g in per_sample])
            mean_grad = torch.mean(clipped, dim=0)

            # Scaling standard-normal samples also works when the bound is 0,
            # where a Normal distribution with zero scale would be rejected.
            noise = torch.randn_like(mean_grad) * (noise_multiplier * bound)

            # Add the noise, scaled by the actual batch size, to the mean gradient
            param.grad = mean_grad + noise / batch_len

        optimizer.step()

    # Calculate average loss and accuracy of current epoch
    avg_loss = torch.mean(torch.tensor(losses)).detach().cpu().numpy()
    avg_acc = torch.mean(torch.tensor(top1_acc)).detach().cpu().numpy()

    return avg_loss, avg_acc

### Define our test function

In [None]:
def accuracy(preds, labels):
    """Return the fraction of positions at which ``preds`` equals ``labels``."""
    hits = preds == labels
    return hits.mean()


def test(model, device, test_loader, epoch):
    """Evaluate ``model`` on ``test_loader`` and return the mean loss and accuracy.

    Per-batch results are weighted by the number of samples in the batch, so
    a smaller final batch does not distort the averages.

    Args:
        model: Network to evaluate; it is switched to evaluation mode.
        device: Device the batches are moved to.
        test_loader: Iterable yielding ``(data, target)`` batches.
        epoch: Current epoch number (not used inside this function).

    Returns:
        Tuple ``(test_loss, top1_avg)``; both are NaN if no samples were seen.
    """
    model.eval()
    criterion = nn.CrossEntropyLoss()

    losses = []
    top1_acc = []
    batch_sizes = []

    with torch.no_grad():
        for data, target in test_loader:
            if len(target) == 0:
                continue
            data, target = data.to(device), target.to(device)
            output = model(data)

            preds = np.argmax(output.detach().cpu().numpy(), axis=1)
            labels = target.detach().cpu().numpy()

            # CrossEntropyLoss averages over the batch by default, so each
            # entry is a per-batch mean that is weighted by batch size below.
            losses.append(criterion(output, target).item())
            top1_acc.append(accuracy(preds, labels))
            batch_sizes.append(len(labels))

    if not batch_sizes:
        return np.float64(np.nan), np.float64(np.nan)

    test_loss = np.average(losses, weights=batch_sizes)
    top1_avg = np.average(top1_acc, weights=batch_sizes)

    return test_loss, top1_avg

### Set training parameters

In [None]:
# Learning rate handed to the SGD optimizer
LR = 5e-3
# Batch sizes for the training and test data loaders
TRAIN_BATCHSIZE = 32
TEST_BATCHSIZE = 1000

# Number of training epochs
EPOCHS = 10

# Noise multiplier passed to train(); it scales the clipping bound to obtain
# the standard deviation of the Gaussian noise
SIGMA = 1.1

### Download dataset, initialize model, create optimizer

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")

# Shuffling is a property of the data split, not of the hardware: the
# training data is always shuffled and the test data never is, so runs on CPU
# and GPU iterate over the data in the same manner.
train_kwargs = {
    'batch_size': TRAIN_BATCHSIZE,
    'shuffle': True
    }

test_kwargs = {
    'batch_size': TEST_BATCHSIZE,
    'shuffle': False
    }

# Loader options that are only applied when running on a GPU
if cuda_available:
    cuda_kwargs = {
        'num_workers': 2,
        'pin_memory': True
        }
    train_kwargs.update(cuda_kwargs)
    test_kwargs.update(cuda_kwargs)

train_dataset, test_dataset = get_mnist_dataset()
train_loader = torch.utils.data.DataLoader(train_dataset, **train_kwargs)
test_loader = torch.utils.data.DataLoader(test_dataset, **test_kwargs)

model = Model().to(device)

optimizer = optim.SGD(model.parameters(), lr=LR)

### Perform training

In [None]:
# Per-epoch metric histories, filled in by the loop below
train_losses, test_losses = [], []
train_accs, test_accs = [], []

epoch_bar = tqdm(range(1, EPOCHS+1), desc="Epoch", unit="epoch")
for epoch in epoch_bar:
    # One pass over the training data, followed by an evaluation on the test data
    train_loss, train_acc = train(model, device, train_loader, optimizer, epoch, SIGMA)
    test_loss, test_acc = test(model, device, test_loader, epoch)

    # Report the metrics of this epoch
    print(f"Epoch: {epoch}, Avg Train Loss: {train_loss:.4f}, Train Acc: {(100. * train_acc):.2f}%")
    print(f"Epoch: {epoch}, Avg Test Loss: {test_loss:.4f}, Test Acc: {(100. * test_acc):.2f}%\n")

    # Append each metric to its history
    for history, value in (
        (train_losses, train_loss),
        (test_losses, test_loss),
        (train_accs, train_acc),
        (test_accs, test_acc),
    ):
        history.append(value)

### Plot results

In [None]:
# Two panels side by side: loss curves on the left, accuracy curves on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Left panel: loss per epoch
for series, label in ((train_losses, "Train Loss"), (test_losses, "Test Loss")):
    ax1.plot(series, label=label)
ax1.set(xlabel="Epoch", ylabel="Loss")
ax1.legend()

# Right panel: accuracy per epoch, with the y-axis fixed to [0, 1]
for series, label in ((train_accs, "Train Accuracy"), (test_accs, "Test Accuracy")):
    ax2.plot(series, label=label)
ax2.set(xlabel="Epoch", ylabel="Accuracy", ylim=(0, 1))
ax2.legend()