## Label Smoothing - Details
 - Regularization technique
 - Reduce overfitting and overconfidence
 - Formula
     - Replaces the one-hot target with a mixture of the one-hot target and the uniform distribution over the $K$ classes, where $\alpha$ is the smoothing factor
     
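$$
    y_{LS} = (1-\alpha) \cdot y_{hot} + \frac{\alpha}{K}
$$

Note: the implementation below uses a closely related variant that assigns $1-\alpha$ to the true class and $\frac{\alpha}{K-1}$ to each of the other $K-1$ classes, so every row still sums to 1.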

In [86]:
import torch
import torch.nn.functional as F
from torch.nn.modules.loss import _WeightedLoss

In [87]:
class LabelSmoothCrossEntropyLoss(_WeightedLoss):
    """Cross-entropy loss evaluated against label-smoothed targets.

    Each integer class label is expanded into a soft distribution that puts
    ``1 - smoothing`` on the true class and ``smoothing / (n_classes - 1)``
    on every other class. The loss is the negative sum, over classes, of
    that distribution times ``log_softmax(inputs)``.

    Args:
        weight: Optional 1-D tensor with one multiplier per class. It scales
            the log-probabilities class-wise before they are combined with
            the smoothed targets.
        reduction: ``'mean'`` (average over samples), ``'sum'`` or ``'none'``
            (one loss value per sample).
        smoothing: Smoothing factor in ``[0, 1)``; ``0.0`` gives ordinary
            one-hot targets.
    """

    _VALID_REDUCTIONS = ('none', 'mean', 'sum')

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        # The parent constructor already stores ``weight`` and ``reduction``
        # on the module, so only ``smoothing`` has to be kept here.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth_one_hot(targets: torch.Tensor, n_classes: int, smoothing=0.0,
                        dtype=None):
        """Build the smoothed one-hot matrix for a batch of class indices.

        Args:
            targets: 1-D integer tensor of class indices, one per sample.
            n_classes: Number of classes (columns of the result).
            smoothing: Smoothing factor in ``[0, 1)``.
            dtype: Floating dtype of the result; ``None`` uses torch's
                default dtype.

        Returns:
            A ``(len(targets), n_classes)`` tensor on ``targets.device`` with
            ``1 - smoothing`` in each row's target column and
            ``smoothing / (n_classes - 1)`` everywhere else. It carries no
            gradient history.

        Raises:
            ValueError: If ``smoothing`` is outside ``[0, 1)``.
        """
        # Explicit check instead of ``assert`` so it survives ``python -O``.
        if not 0 <= smoothing < 1:
            raise ValueError(
                "smoothing must satisfy 0 <= smoothing < 1, got {!r}".format(smoothing))

        # With a single class there are no off-target entries, so avoid the
        # division by zero that ``n_classes - 1`` would otherwise cause.
        off_value = smoothing / (n_classes - 1) if n_classes > 1 else 0.0

        with torch.no_grad():
            smoothed = torch.full((targets.size(0), n_classes), off_value,
                                  dtype=dtype, device=targets.device)
            smoothed.scatter_(1, targets.unsqueeze(1), 1. - smoothing)
        return smoothed

    def forward(self, inputs, targets):
        """Compute the label-smoothed cross-entropy.

        Args:
            inputs: Logits whose last dimension indexes the classes; the
                helper that builds the targets assumes shape
                ``(batch, n_classes)``.
            targets: 1-D integer tensor of class indices, one per sample.

        Returns:
            A scalar for ``reduction`` ``'mean'`` / ``'sum'``, or one loss
            value per sample for ``'none'``.

        Raises:
            ValueError: If ``self.reduction`` is not ``'none'``, ``'mean'``
                or ``'sum'``, or if ``self.smoothing`` is outside ``[0, 1)``.
        """
        # Reject unknown reductions instead of silently returning the
        # unreduced per-sample losses.
        if self.reduction not in self._VALID_REDUCTIONS:
            raise ValueError(
                "{!r} is not a valid value for reduction".format(self.reduction))

        lsm = F.log_softmax(inputs, -1)

        # Build the targets in at least the default float precision, and in
        # higher precision when the log-probabilities are (e.g. float64), so
        # the target probabilities are not rounded to float32 first.
        target_dtype = torch.promote_types(lsm.dtype, torch.get_default_dtype())
        smoothed = self._smooth_one_hot(targets, inputs.size(-1), self.smoothing,
                                        dtype=target_dtype)

        if self.weight is not None:
            # Broadcast the per-class weights across the batch dimension.
            lsm = lsm * self.weight.unsqueeze(0)

        loss = -(smoothed * lsm).sum(-1)

        if self.reduction == 'sum':
            return loss.sum()
        if self.reduction == 'mean':
            return loss.mean()
        return loss

In [40]:
# Instantiate the loss with its defaults (reduction='mean', smoothing=0.0).
loss_function = LabelSmoothCrossEntropyLoss()

In [45]:
# Toy batch: random logits for 3 samples over 5 classes, plus one random
# class index per sample drawn from {0, ..., 4}.
inputs = torch.randn(3, 5, requires_grad=True)
targets = torch.empty(3, dtype=torch.long).random_(5)
inputs, targets

(tensor([[-0.6981,  0.9529, -0.0807, -0.5213, -0.0753],
         [ 0.8347, -1.4822, -0.0525, -0.7690,  0.4886],
         [-1.1051,  0.3915,  0.5995,  1.0561,  0.7371]], requires_grad=True),
 tensor([1, 1, 3]))

In [32]:
# Smoothing factor used in the step-by-step walkthrough that follows.
SMOOTH = 0.05

In [35]:
# Probability given to each non-target class: the smoothing mass split
# evenly over the other (n_classes - 1) classes.
FILL_VALUE = SMOOTH/(inputs.size(-1)-1)
FILL_VALUE

0.0125

In [36]:
# Step 1: a tensor shaped like the logits with every entry set to FILL_VALUE.
torch.empty(inputs.size()).fill_(FILL_VALUE)

tensor([[0.0125, 0.0125, 0.0125, 0.0125, 0.0125],
        [0.0125, 0.0125, 0.0125, 0.0125, 0.0125],
        [0.0125, 0.0125, 0.0125, 0.0125, 0.0125]])

In [37]:
# Step 2: overwrite each row's target column with 1 - SMOOTH.
# NOTE(review): the captured output below has its 0.95 entries in columns
# 2, 0, 4, which does not match the targets [1, 1, 3] displayed earlier; it
# appears to come from an earlier run with different random targets
# (execution count 37 vs. 45) - re-run the notebook to refresh it.
torch.empty(inputs.size()).fill_(FILL_VALUE).scatter_(1,targets.data.unsqueeze(1),1. - SMOOTH)

tensor([[0.0125, 0.0125, 0.9500, 0.0125, 0.0125],
        [0.9500, 0.0125, 0.0125, 0.0125, 0.0125],
        [0.0125, 0.0125, 0.0125, 0.0125, 0.9500]])

In [52]:
# Same construction through the class helper (a static method, so the
# smoothing value is passed explicitly rather than read from the instance).
smooth_targets = loss_function._smooth_one_hot(targets, inputs.size(-1), smoothing=0.05)
smooth_targets

tensor([[0.0125, 0.9500, 0.0125, 0.0125, 0.0125],
        [0.0125, 0.9500, 0.0125, 0.0125, 0.0125],
        [0.0125, 0.0125, 0.0125, 0.9500, 0.0125]])

In [66]:
# Log-probabilities: log-softmax of the logits over the last (class) dimension.
lsm = F.log_softmax(inputs, -1)
lsm

tensor([[-2.4091, -0.7581, -1.7917, -2.2322, -1.7863],
        [-0.8833, -3.2003, -1.7706, -2.4870, -1.2295],
        [-3.2565, -1.7599, -1.5519, -1.0953, -1.4142]],
       grad_fn=<LogSoftmaxBackward>)

In [67]:
# Plain softmax probabilities over the class dimension, for comparison.
F.softmax(inputs,-1)

tensor([[0.0899, 0.4686, 0.1667, 0.1073, 0.1676],
        [0.4134, 0.0408, 0.1702, 0.0832, 0.2925],
        [0.0385, 0.1721, 0.2118, 0.3345, 0.2431]], grad_fn=<SoftmaxBackward>)

In [68]:
# Taking the log of the softmax gives the same values as F.log_softmax.
torch.log(F.softmax(inputs,-1))

tensor([[-2.4091, -0.7581, -1.7917, -2.2322, -1.7863],
        [-0.8833, -3.2003, -1.7706, -2.4870, -1.2295],
        [-3.2565, -1.7599, -1.5519, -1.0953, -1.4142]], grad_fn=<LogBackward>)

In [73]:
# Per-sample loss: negative sum over classes of smoothed target times
# log-probability (what the class returns for reduction='none').
-(smooth_targets*torch.log(F.softmax(inputs,-1))).sum(-1)

tensor([0.8229, 3.1199, 1.1403], grad_fn=<NegBackward>)

In [84]:
# Per-sample losses added up over the batch (the reduction='sum' result).
-(smooth_targets*torch.log(F.softmax(inputs,-1))).sum(-1).sum()

tensor(5.0831, grad_fn=<NegBackward>)

In [85]:
# Per-sample losses averaged over the batch (the reduction='mean' result).
-(smooth_targets*torch.log(F.softmax(inputs,-1))).sum(-1).mean()

tensor(1.6944, grad_fn=<NegBackward>)

#### References

1. https://stackoverflow.com/questions/55681502/label-smoothing-in-pytorch  
2. https://github.com/NingAnMe/Label-Smoothing-for-CrossEntropyLoss-PyTorch