In [1]:
%load_ext autoreload
%autoreload 2

### Google Colab Setup

we need to run a few commands to set up our environment on Google Colab. If you are running this notebook on a local machine you can skip this section. Run the following cell to mount your Google Drive.

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Now recall the path in your Google Drive where you uploaded this notebook, fill it in below.

In [4]:
import os
import sys

# TODO: Fill in the Google Drive path where you uploaded the assignment
# Example: If you create a 'Test' folder and put all the files under 'example' folder, then 'Test/example'
# GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = 'Test/example'
GOOGLE_DRIVE_PATH_AFTER_MYDRIVE = None
GOOGLE_DRIVE_PATH = os.path.join('drive', 'My Drive', GOOGLE_DRIVE_PATH_AFTER_MYDRIVE)
sys.path.append(GOOGLE_DRIVE_PATH)

print(os.listdir(GOOGLE_DRIVE_PATH))

['__pycache__', 'optimizers.py', 'losses.py', 'activations.py', 'models.py', 'data.py', 'utils.py', 'modules.py']


### Setting

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim

# custom utils
import modules
import activations
import losses
import optimizers
import utils
import models
import data

torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

## Multi-Class Support Vector Machin Loss

![](https://cs231n.github.io/assets/margin.jpg)


> The Multiclass Support Vector Machine **"wants"** the score of the correct class to be higher than all other scores by at least a margin of $\Delta$. If any class has a score inside the red region (or higher), then there will be accumulated loss. Otherwise the loss will be zero.






Multi-Class SVM Loss의 수식은 아래와 같다.
$$
\begin{aligned}
L_i & =\sum_{j \neq y_i} \begin{cases}s_j-s_{y_i}+\Delta & \text { if } s_j-s_{y_i}+\Delta \geq 0 \\\\
0 & \text { otherwise }\end{cases} \\\\
& =\sum_{j \neq y_i} \max \left(0, s_j-s_{y_i}+\Delta\right)
\end{aligned}
$$


Multi-Class SVM Loss 의 도함수는 아래와 같다.

$$\begin{equation}
\frac{\partial L_i}{\partial s_k}= \begin{cases} 1\left(s_k-s_{y_i}+\Delta>0\right) & \text { if } k \neq y_i \\\\ -\sum_{j\neq{y_i}} 1\left(s_j-s_{y_i}+\Delta>0\right) & \text { otherwise }\end{cases}
\end{equation}$$

In [6]:
class MulticlassSVMLoss(torch.autograd.Function):

    @staticmethod
    def forward(ctx, scores, y_true):

        # equation 1. calculate loss
        correct_class_scores = scores[range(scores.shape[0]), y_true].unsqueeze(-1)
        margins = scores - correct_class_scores + 1
        margins[range(scores.shape[0]), y_true] = 0

        margins = margins.relu()
        ctx.save_for_backward(scores, y_true, margins)

        loss = margins.sum(dim=1).mean()
        return loss

    @staticmethod
    def backward(ctx, grad_output):
        scores, y_true, margins = ctx.saved_tensors

        # equation 2. calculate derivative
        ds = torch.zeros_like(scores)
        ds[margins > 0] = 1
        ds[range(scores.shape[0]), y_true] -= torch.sum(ds, dim=1)

        ds /= ds.shape[0]
        return ds, None

## Softmax Loss





Softmax Loss의 수식은 아래와 같다.
$$\begin{aligned}
L_i&=-\log\frac{e^{s_{y_i}}}{\sum_j{e^{s_j}}}\\\\
&=-s_{y_i}+\log\sum_j{e^{s_j}}
\end{aligned}$$

Softmax Loss 의 도함수는 아래와 같다.
> $\text{softmax}(z)_i=\frac{e^{z_i}}{\sum_j{e^{z_j}}}$

$$\begin{aligned}
\frac{\partial L_i}{\partial s_k}&=
\begin{cases} \frac{e^{s_{k}}}{\sum_j{e^{s_j}}} & \text { if } k \neq y_i \\\\ -1 + \frac{e^{s_{k}}}{\sum_j{e^{s_j}}} & \text { otherwise }\end{cases}\\\\
&=\begin{cases} \text{softmax}(s)_k & \text { if } k \neq y_i \\\\ \text{softmax}(s)_k-1 & \text { otherwise }\end{cases}
\end{aligned}$$






In [7]:
class SoftmaxLoss(torch.autograd.Function):

    @staticmethod
    def forward(ctx, scores, y_true):

        # equation 1. calculate loss
        scores -= scores.max(dim=1, keepdim=True).values
        log_softmax = scores - scores.logsumexp(dim=1, keepdim=True)
        ctx.save_for_backward(scores, y_true, log_softmax)

        loss = -log_softmax[range(log_softmax.shape[0]), y_true]
        return loss.mean()

    @staticmethod
    def backward(ctx, grad_output):
        scores, y_true, log_softmax = ctx.saved_tensors

        # equation 2. calculate derivative
        ds = log_softmax.exp()
        ds[range(ds.shape[0]), y_true] -= 1
        ds /= ds.shape[0]

        return ds, None

## Cross-Entropy Loss

> **Definition** : $$\begin{equation}
H(P,Q) = -\sum_{y} P(y)\cdot\log{Q(y)}
\end{equation}$$

Classification 문제에서, Cross-Entropy를 적용하게 될 경우,
일반적으로 Q(y)는 softmax function을 사용하여 확률분포생성하며, P(y)는 label의 원핫벡터가 된다. 즉, 정답클랙스일 경우 P(y)는 1이 되며 아닐 경우 0의 값을 가진다. 이를 수식으로 나타내면 아래와 같다.


> $\begin{aligned}
&P(y) = \begin{cases} 0 & \text { if } y \neq y_i \\ 1 & \text { otherwise }\end {cases}\\\\
&Q(y) = \text{softmax}(s)_y\\\\
\end{aligned}$







전체식을 다시 나타내면,

$$\begin{aligned}
L_i &=\sum_{y} P(y)\cdot\text{Softmax Loss}(s)_y\\
&= 1\cdot\text{Softmax Loss}(s)_{y_i}
\end{aligned}$$




In [8]:
class CrossEntropyLoss(torch.autograd.Function):

    @staticmethod
    def forward(ctx, scores, y_true):

        # Calculate Softmax
        scores -= scores.max(dim=1, keepdim=True).values
        log_softmax = scores - scores.logsumexp(dim=1, keepdim=True)
        one_hot_vec = torch.nn.functional.one_hot(y_true, scores.shape[1])

        ctx.save_for_backward(scores, log_softmax, one_hot_vec)

        loss = -(one_hot_vec * log_softmax).sum(dim=1)
        return loss.mean()

    @staticmethod
    def backward(ctx, grad_output):
        scores, log_softmax, one_hot_vec = ctx.saved_tensors

        ds = log_softmax.exp() - one_hot_vec
        ds /= ds.shape[0]

        return ds, None

### Kullback-Leibler divergence Loss



> **Definition** : $$\begin{aligned}
D_{KL}(P||Q) &= H(P,Q)-H(P)\\\\
&= \sum_{y} P(y)\cdot\log{P(y)} - P(y)\cdot\log{Q(y)}\\
&= \sum_{y} P(y)\cdot\log{\frac{P(y)}{Q(y)}}
\end{aligned}$$



In [9]:
class KLdivergenceLoss(torch.autograd.Function):

    @staticmethod
    def forward(ctx, scores, y_true):

        # Calculate Softmax
        scores -= scores.max(dim=1, keepdim=True).values
        log_softmax = scores - scores.logsumexp(dim=1, keepdim=True)


        one_hot_vec = torch.nn.functional.one_hot(y_true, scores.shape[1])
        log_p = one_hot_vec - one_hot_vec.logsumexp(dim=1, keepdim=True)

        ctx.save_for_backward(scores, log_softmax, log_p)

        loss = (one_hot_vec * (log_p - log_softmax)).sum(dim=1)
        return loss.mean()

    @staticmethod
    def backward(ctx, grad_output):
        scores, log_softmax, log_p = ctx.saved_tensors

        ds = log_softmax.exp() - log_p.exp()
        ds /= ds.shape[0]

        return ds, None

## Sanity Check

In [18]:
input_dim = 3 * 32 * 32
num_classes = 10

dset_train, dset_val, dset_test = data.load_cifar10(val_ratio=0.2)

data_loaders = {}
data_loaders['train'] = torch.utils.data.DataLoader(dset_train, batch_size=64, shuffle=True)
data_loaders['val'] = torch.utils.data.DataLoader(dset_val, batch_size=64, shuffle=True)
data_loaders['test'] = torch.utils.data.DataLoader(dset_test, batch_size=64, shuffle=False)


from torchsummary import summary
model = models.MultiLayerNet(input_dim, 128, num_classes)
summary(model, input_size=(input_dim,))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                  [-1, 128]         393,344
              ReLU-2                  [-1, 128]               0
            Linear-3                   [-1, 10]           1,290
Total params: 394,634
Trainable params: 394,634
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.00
Params size (MB): 1.51
Estimated Total Size (MB): 1.52
----------------------------------------------------------------


In [22]:
criterions = {'PyTorch Cross-Entropy Loss': nn.CrossEntropyLoss(),
              'Multi-classSVM Loss' : MulticlassSVMLoss.apply,
              'Softmax Loss' : SoftmaxLoss.apply,
              'CrossEntropy Loss' : CrossEntropyLoss.apply,
              'KLdivergence Loss' : KLdivergenceLoss.apply}


for key, loss_f in criterions.items():
  print(f"{key}\n")
  model = models.MultiLayerNet(input_dim, 128, num_classes)
  optimizer = optim.SGD(model.parameters(), lr=0.001)
  history = utils.runner(model, loss_f, optimizer, data_loaders, num_epochs=10, msg=False)
  print('===========================\n')

PyTorch Cross-Entropy Loss

Train using cpu
Finished Training
Accuracy of the network on the 157 test images: 38.85%

Multi-classSVM Loss

Train using cpu
Finished Training
Accuracy of the network on the 157 test images: 47.58%

Softmax Loss

Train using cpu
Finished Training
Accuracy of the network on the 157 test images: 38.75%

CrossEntropy Loss

Train using cpu
Finished Training
Accuracy of the network on the 157 test images: 38.87%

KLdivergence Loss

Train using cpu
Finished Training
Accuracy of the network on the 157 test images: 31.80%

