
**Install requirements**

In [0]:
!pip3 install 'torch==1.3.1'
!pip3 install 'torchvision==0.5.0'
!pip3 install 'Pillow-SIMD'
!pip3 install 'tqdm'

**Import libraries**

In [0]:
import os
import logging

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Subset, DataLoader
from torch.backends import cudnn

import torchvision
from torchvision import transforms

from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import ParameterGrid


FIG_SIZE = (8, 5)    # figsize handed to plt.figure() by plot_stat
RANDOM_STATE = 42    # seed for PyTorch's random number generator (applied just below)

# The seed was declared but never used; apply it so that weight initialization and
# DataLoader shuffling start from the same RNG state on every Restart & Run All.
torch.manual_seed(RANDOM_STATE)

In [0]:
def plot_stat(t, labelt = 'Train', v = None, labelv = 'Val'):
    """Plot one or two per-epoch series on a single figure and show it.

    Args:
        t: sequence with one value per epoch (always drawn).
        labelt: legend label for ``t``.
        v: optional second sequence drawn on the same axes; skipped when ``None``.
        labelv: legend label for ``v``.
    """
    plt.figure(figsize=FIG_SIZE)
    plt.plot(range(len(t)), t, marker='o', label=labelt)
    if v is not None:
        # Index the second curve by its own length: using len(t) here made the call
        # fail whenever the two series did not have exactly the same length.
        plt.plot(range(len(v)), v, marker='o', label=labelv)

    plt.title("Loss for each Epoch")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()

In [0]:
from torch.autograd import Function

class ReverseLayerF(Function):
    """Gradient reversal layer.

    The forward pass hands the input through unchanged; the backward pass
    returns the incoming gradient negated and multiplied by ``alpha``.
    """

    @staticmethod
    def forward(ctx, x, alpha):
        # Keep the scaling factor around; backward() reads it from the context.
        ctx.alpha = alpha
        identity = x.view_as(x)
        return identity

    @staticmethod
    def backward(ctx, grad_output):
        # Flip the sign and scale; ``alpha`` itself receives no gradient (None).
        reversed_grad = -(grad_output * ctx.alpha)
        return reversed_grad, None


In [0]:
from torch.hub import load_state_dict_from_url

# Public names defined by this cell (the model class and its factory function)
__all__ = ['AlexNet', 'alexnet']


# Download URL of the pre-trained checkpoint, keyed by architecture name;
# read by alexnet(pretrained=True)
model_urls = {
    'alexnet': 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth',
}


class AlexNet(nn.Module):
    """AlexNet with a second fully connected head used as a domain discriminator.

    Both heads read the same flattened convolutional features:

    * ``classifier`` -- the label predictor, used when ``forward`` is called without ``alpha``.
    * ``dann`` -- the domain discriminator, fed through a gradient reversal layer
      when ``forward`` is called with ``alpha``.

    Args:
        num_classes (int): number of outputs of the ``classifier`` head.
        num_domains (int, optional): number of outputs of the ``dann`` head.
            Defaults to ``num_classes``, which keeps the layer shapes this class
            always had, so existing callers are unaffected.
    """

    def __init__(self, num_classes=1000, num_domains=None):
        super(AlexNet, self).__init__()
        if num_domains is None:
            num_domains = num_classes
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        # Built in the same order as before (classifier first, then dann) so the
        # sequence of random initializations is unchanged.
        self.classifier = self._make_head(num_classes)
        self.dann = self._make_head(num_domains)

    @staticmethod
    def _make_head(out_features):
        """Build one fully connected head mapping the 256*6*6 features to ``out_features`` outputs."""
        return nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, out_features),
        )

    def forward(self, x, alpha=None):
        """Run the shared feature extractor, then exactly one of the two heads.

        Args:
            x: input batch given to ``self.features``.
            alpha: when ``None`` the ``classifier`` head is used; otherwise the
                features go through ``ReverseLayerF`` with this factor and then
                through the ``dann`` head.

        Returns:
            The output of the selected head.
        """
        features = self.features(x)
        features = self.avgpool(features)
        # Flatten everything except the batch dimension
        features = torch.flatten(features, 1)

        if alpha is None:
            # Supervised path: plain label prediction
            return self.classifier(features)

        # Adversarial path: identity forward, reversed (and alpha-scaled) gradients backward
        reverse_feature = ReverseLayerF.apply(features, alpha)
        return self.dann(reverse_feature)


def alexnet(pretrained=False, progress=True, **kwargs):
    r"""AlexNet model architecture from the
    `"One weird trick..." <https://arxiv.org/abs/1404.5997>`_ paper.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: forwarded unchanged to the ``AlexNet`` constructor

    When ``pretrained`` is True, the first two linear layers of the ``dann`` head
    are additionally initialized with a copy of the corresponding pre-trained
    ``classifier`` layers.
    """
    model = AlexNet(**kwargs)
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls['alexnet'],
                                              progress=progress)
        # strict=False tolerates missing/unexpected keys but still raises on a shape
        # mismatch (e.g. a classifier built with num_classes != 1000), so only load
        # the entries whose shapes match this model.
        own_state = model.state_dict()
        compatible = {key: value for key, value in state_dict.items()
                      if key in own_state and value.shape == own_state[key].shape}
        model.load_state_dict(compatible, strict=False)

        # Copy the values instead of re-pointing ``.data``: assigning
        # ``dann[i].weight.data = classifier[i].weight.data`` makes both parameters
        # share one storage, so every optimizer update of one head silently
        # changes the other as well.
        with torch.no_grad():
            for idx in (1, 4):
                model.dann[idx].weight.copy_(model.classifier[idx].weight)
                model.dann[idx].bias.copy_(model.classifier[idx].bias)
    return model

**Set Arguments**

In [0]:
# Device the network and the batches are moved to: 'cuda' or 'cpu'
DEVICE = 'cuda'

# Number of outputs of the label classifier head
NUM_CLASSES = 7

# Larger batches tolerate larger learning rates; a common rule of thumb is to
# scale the learning rate by the same factor as the batch size.
BATCH_SIZE = 256

# SGD settings
LR = 1e-3            # initial learning rate
MOMENTUM = 0.9       # SGD momentum
WEIGHT_DECAY = 5e-5  # weight-decay regularization

# Schedule
NUM_EPOCHS = 30      # passes over the training set
STEP_SIZE = 20       # epochs between two learning-rate step-downs
GAMMA = 0.1          # factor the learning rate is multiplied by at each step-down

# The running loss is printed every LOG_FREQUENCY optimization steps
LOG_FREQUENCY = 10

# Domain-adversarial training
DANN = True          # when False only the supervised classification step is run
ALPHA = 0.1          # factor handed to the gradient reversal layer

**Define Data Preprocessing**

In [0]:
# Per-channel mean / std used by both pipelines (the values torchvision documents
# for its pre-trained models)
NORM_MEAN = (0.485, 0.456, 0.406)
NORM_STD = (0.229, 0.224, 0.225)

# Define transforms for training phase
# NOTE(review): the random crop used to be RandomCrop(64, padding=2), which fed 64x64
# images to the network during training while evaluation runs on 224x224 images.
# The crop now keeps the 224x224 size and only jitters the image by up to 2 pixels.
train_transform = transforms.Compose([
    transforms.Resize(256),                # Resizes short side of the PIL image to 256
    transforms.CenterCrop(224),            # Crops a central 224x224 patch
    transforms.RandomCrop(224, padding=2), # Pads by 2 pixels, then takes a random 224x224 crop
    transforms.ToTensor(),                 # Turn PIL Image to torch.Tensor
    transforms.Normalize(NORM_MEAN, NORM_STD),  # Normalizes tensor with mean and standard deviation
])
# Define transforms for the evaluation phase (deterministic: no random crop)
eval_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(NORM_MEAN, NORM_STD),
])

**Prepare Dataset**

In [0]:
# Clone github repository with data
if not os.path.isdir('./Homework3-PACS'):
  !git clone https://github.com/MachineLearning2020/Homework3-PACS.git

DATA_DIR = 'Homework3-PACS/PACS'

TRAIN_DATA_DIR = DATA_DIR + '/photo'
TEST_DATA_DIR = DATA_DIR + '/art_painting'

CARTOON_DATA_DIR =  DATA_DIR + '/cartoon'
SKETCH_DATA_DIR = DATA_DIR + '/sketch'

# Prepare Pytorch train/test Datasets
train_dataset = torchvision.datasets.ImageFolder(TRAIN_DATA_DIR, transform=train_transform)
target_dataset = torchvision.datasets.ImageFolder(TEST_DATA_DIR, transform=train_transform)
test_dataset = torchvision.datasets.ImageFolder(TEST_DATA_DIR, transform=eval_transform)
cartoon_dataset = torchvision.datasets.ImageFolder(CARTOON_DATA_DIR, transform=eval_transform)
sketch_dataset = torchvision.datasets.ImageFolder(SKETCH_DATA_DIR, transform=eval_transform)

train_indexes = [idx for idx in range(len(train_dataset)) if idx % 5]
target_indexes = [idx for idx in range(len(target_dataset)) if not idx % 5]
test_indexes = [idx for idx in range(len(test_dataset)) if not idx % 5]
cartoon_indexes = [idx for idx in range(len(cartoon_dataset)) if not idx % 5]
sketch_indexes = [idx for idx in range(len(sketch_dataset)) if not idx % 5]

train_dataset = Subset(train_dataset, train_indexes)
target_dataset = Subset(target_dataset, target_indexes)
test_dataset = Subset(test_dataset, test_indexes)
cartoon_dataset = Subset(cartoon_dataset, cartoon_indexes)
sketch_dataset = Subset(sketch_dataset, sketch_indexes)

# Check dataset sizes
print('Train Dataset: {}'.format(len(train_dataset)))
print('Test Dataset: {}'.format(len(test_dataset)))
print('Cartoon Dataset: {}'.format(len(cartoon_dataset)))
print('Sketch Dataset: {}'.format(len(sketch_dataset)))

**Prepare Dataloaders**

In [0]:
# Dataloaders iterate over pytorch datasets and transparently provide useful functions (e.g. parallelization and shuffling)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, drop_last=True)
# The target loader only feeds the domain discriminator during training, so it is
# shuffled like the source loader: without shuffling, the first batch drawn from it
# is always the same set of images.
target_dataloader = DataLoader(target_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)

# Evaluation loaders keep a fixed order
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
cartoon_dataloader = DataLoader(cartoon_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
sketch_dataloader = DataLoader(sketch_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

**Prepare Network**

In [0]:
# Build AlexNet and load the pre-trained weights
net = alexnet(pretrained=True)

# Replace the last fully connected layer of each head:
#   - label classifier -> one output per class
#   - domain discriminator -> two outputs
net.classifier[6] = nn.Linear(in_features=4096, out_features=NUM_CLASSES)
net.dann[6] = nn.Linear(in_features=4096, out_features=2)

# It is mandatory to study torchvision.models.alexnet source code



**Prepare Training**

In [0]:
# Loss: cross entropy, used for both the label and the domain predictions
criterion = nn.CrossEntropyLoss()

# Parameters handed to the optimizer. Here: every parameter of the network.
# A subset can be selected through the submodules instead,
# e.g. net.classifier.parameters() for the fully connected classifier head.
parameters_to_optimize = net.parameters()

# Optimizer: SGD with momentum and weight decay
optimizer = optim.SGD(
    params=parameters_to_optimize,
    lr=LR,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY,
)

# Scheduler: step-down policy, multiplies the learning rate by GAMMA every STEP_SIZE epochs
scheduler = optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=STEP_SIZE,
    gamma=GAMMA,
)

**Train**

In [0]:
# By default, everything is loaded to cpu
net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda
# The flag has to be assigned: the bare expression `cudnn.benchmark` only reads it
cudnn.benchmark = True

# Per-epoch averages of the per-step losses
loss_train = []   # classification + domain losses
loss_dann = []    # domain losses only (source + target)
loss_alex = []    # classification loss only
current_step = 0

# Keep ONE iterator over the target loader for the whole run. Calling
# next(iter(target_dataloader)) at every step restarts the loader each time
# (re-spawning its workers) and only ever returns its first batch.
target_iterator = iter(target_dataloader) if DANN else None

# Start iterating over the epochs
for epoch in range(NUM_EPOCHS):
  print('Starting epoch {}/{}, LR = {}'.format(epoch+1, NUM_EPOCHS, scheduler.get_last_lr()))
  losses_net = []
  losses_alex = []
  losses_dann = []

  net.train() # Sets module in training mode

  # Iterate over the source dataset
  for images, labels in train_dataloader:
    # Bring data over the device of choice
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    # Gradients accumulate across backward passes: the three backward() calls below
    # are summed on purpose, so the gradients are cleared once per step, here.
    optimizer.zero_grad()

    ### FIRST STEP: supervised classification on source images ###
    outputs = net(images)
    loss = criterion(outputs, labels)
    loss_alex_tmp = loss.item()
    loss.backward()

    loss_dann_tmp = 0
    if DANN:
      ### SECOND STEP: discriminator on source images (domain label 0) ###
      outputs = net(images, alpha = ALPHA)
      source_domain = torch.zeros(images.size(0), dtype = torch.int64, device = DEVICE)
      loss = criterion(outputs, source_domain)
      loss_dann_tmp += loss.item()
      loss.backward()

      ### THIRD STEP: discriminator on target images (domain label 1) ###
      try:
        target_images, _ = next(target_iterator)
      except StopIteration:
        # Target loader exhausted: start a new pass over it
        target_iterator = iter(target_dataloader)
        target_images, _ = next(target_iterator)
      target_images = target_images.to(DEVICE)

      outputs = net(target_images, alpha = ALPHA)
      target_domain = torch.ones(target_images.size(0), dtype = torch.int64, device = DEVICE)
      loss = criterion(outputs, target_domain)
      loss_dann_tmp += loss.item()
      loss.backward()

    loss_tmp = loss_alex_tmp + loss_dann_tmp

    # Log loss
    if current_step % LOG_FREQUENCY == 0:
      print('Step {}, Loss {}'.format(current_step, loss_tmp))

    # Update weights based on the accumulated gradients
    optimizer.step()

    losses_alex.append(loss_alex_tmp)
    losses_net.append(loss_tmp)
    losses_dann.append(loss_dann_tmp)

    current_step += 1

  loss_alex.append(sum(losses_alex)/len(losses_alex))
  loss_dann.append(sum(losses_dann)/len(losses_dann))
  loss_train.append(sum(losses_net)/len(losses_net))

  # Step the scheduler
  scheduler.step()

In [0]:
# Total training loss per epoch
plot_stat(t=loss_train)
# Classification loss against domain-discriminator loss
plot_stat(t=loss_alex, labelt='AlexNet', v=loss_dann, labelv='DANN')

**Test**

In [0]:
net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda
net.eval() # Set Network to evaluation mode

running_corrects = 0
# No gradients are needed for evaluation: skipping the autograd bookkeeping saves
# memory and time, and makes the deprecated `.data` accesses unnecessary.
with torch.no_grad():
  for images, labels in tqdm(test_dataloader):
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    # Forward Pass (label classifier head)
    outputs = net(images)

    # Get predictions: index of the highest score in each row
    _, preds = torch.max(outputs, 1)

    # Update Corrects
    running_corrects += torch.sum(preds == labels).item()

# Calculate Accuracy
accuracy = running_corrects / float(len(test_dataset))

print('Test Accuracy: {}'.format(accuracy))

**Grid Search**

In [0]:
def _evaluate_accuracy(model, dataloader, num_samples):
  """Return the fraction of ``num_samples`` items from ``dataloader`` that ``model`` classifies correctly.

  The model is switched to evaluation mode and the forward passes run without
  gradient tracking.
  """
  model.train(False) # Set Network to evaluation mode
  corrects = 0
  with torch.no_grad():
    for images, labels in dataloader:
      images = images.to(DEVICE)
      labels = labels.to(DEVICE)
      _, preds = torch.max(model(images), 1)
      corrects += torch.sum(preds == labels).item()
  return corrects / float(num_samples)


hyperparameters = {'NUM_EPOCHS':[25,30],
                   'LR':[1e-3,1e-2],
                   'STEP_SIZE':[10,20],
                   'ALPHA':[0.5,0.1,0.05]}

# Best result so far. Everything is initialized up front so the names exist
# even if no configuration ever improves on the starting accuracy.
best_acc = 0
best_config = None
best_loss_net = []
best_loss_dann = []
best_loss_alex = []

for config in ParameterGrid(hyperparameters):
  print('Hyper: epoch {}, LR = {}, STEP_SIZE = {}, ALPHA = {}'.format(config['NUM_EPOCHS'], config['LR'], config['STEP_SIZE'], config['ALPHA'],))
  # Release our references to the previous model and its optimizer before building
  # a new one (unlike `del net`, this also works when no model exists yet)
  net = optimizer = scheduler = None

  # Loading AlexNet model
  net = alexnet(pretrained=True)
  net.classifier[6] = nn.Linear(4096, NUM_CLASSES)
  net.dann[6] = nn.Linear(4096,2)

  # Choose parameters to optimize
  parameters_to_optimize = net.parameters()
  # Define optimizer
  optimizer = optim.SGD(parameters_to_optimize, lr=config['LR'], momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
  # Define scheduler
  scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=config['STEP_SIZE'], gamma=GAMMA)

  # By default, everything is loaded to cpu
  net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda
  # The flag has to be assigned: the bare expression `cudnn.benchmark` only reads it
  cudnn.benchmark = True

  # Per-epoch statistics of this configuration
  loss_train = []
  loss_dann = []
  loss_alex = []
  accuracy = []
  current_step = 0

  # Keep ONE iterator over the target loader per configuration. Calling
  # next(iter(target_dataloader)) at every step restarts the loader each time
  # and only ever returns its first batch.
  target_iterator = iter(target_dataloader) if DANN else None

  # Start iterating over the epochs
  for epoch in range(config['NUM_EPOCHS']):
    losses_net = []
    losses_alex = []
    losses_dann = []

    # Validation at the end of the previous epoch left the network in evaluation mode
    net.train()

    # Iterate over the source dataset
    for images, labels in train_dataloader:
      # Bring data over the device of choice
      images = images.to(DEVICE)
      labels = labels.to(DEVICE)

      # The backward() calls below are summed on purpose; clear gradients once per step
      optimizer.zero_grad()

      ### FIRST STEP: supervised classification on source images ###
      outputs = net(images)
      loss = criterion(outputs, labels)
      loss_alex_tmp = loss.item()
      loss.backward()

      loss_dann_tmp = 0
      if DANN:
        ### SECOND STEP: discriminator on source images (domain label 0) ###
        outputs = net(images, alpha = config['ALPHA'])
        source_domain = torch.zeros(images.size(0), dtype = torch.int64, device = DEVICE)
        loss = criterion(outputs, source_domain)
        loss_dann_tmp += loss.item()
        loss.backward()

        ### THIRD STEP: discriminator on target images (domain label 1) ###
        try:
          target_images, _ = next(target_iterator)
        except StopIteration:
          # Target loader exhausted: start a new pass over it
          target_iterator = iter(target_dataloader)
          target_images, _ = next(target_iterator)
        target_images = target_images.to(DEVICE)

        outputs = net(target_images, alpha = config['ALPHA'])
        target_domain = torch.ones(target_images.size(0), dtype = torch.int64, device = DEVICE)
        loss = criterion(outputs, target_domain)
        loss_dann_tmp += loss.item()
        loss.backward()

      loss_tmp = loss_alex_tmp + loss_dann_tmp

      # Update weights based on the accumulated gradients
      optimizer.step()

      losses_alex.append(loss_alex_tmp)
      losses_net.append(loss_tmp)
      losses_dann.append(loss_dann_tmp)

      current_step += 1

    loss_alex.append(sum(losses_alex)/len(losses_alex))
    loss_dann.append(sum(losses_dann)/len(losses_dann))
    loss_train.append(sum(losses_net)/len(losses_net))

    # Step the scheduler
    scheduler.step()

    ### VALIDATION MODEL ###
    # Mean of the accuracies on the two held-out domains, recorded once per epoch
    accuracy_cartoon = _evaluate_accuracy(net, cartoon_dataloader, len(cartoon_dataset))
    accuracy_sketch = _evaluate_accuracy(net, sketch_dataloader, len(sketch_dataset))
    accuracy.append((accuracy_cartoon + accuracy_sketch)/2)

  # NOTE(review): the selection score is the validation accuracy averaged over ALL
  # epochs of the run, not the accuracy of the final model -- confirm this is intended.
  avg_accuracy = sum(accuracy)/len(accuracy)

  if avg_accuracy > best_acc:
    best_acc = avg_accuracy
    best_config = config
    print('Found best average accuracy on val sets = {}'.format(best_acc))
    best_loss_net = loss_train
    best_loss_dann = loss_dann
    best_loss_alex = loss_alex


In [0]:
# Show the selected hyperparameters, then the loss curves recorded for that configuration
print(best_config)
plot_stat(t=best_loss_alex, labelt='Alex', v=best_loss_dann, labelv='DANN')
plot_stat(t=best_loss_net, labelt='Net')