# Introduction

I am a beginner on Kaggle and this is my first notebook, so I would appreciate it very much if you could point out anything that is wrong or impolite.

* In this notebook I share the source code of the bi-tempered logistic loss and the Taylor cross entropy loss for Keras/TF
* I also demonstrate how these loss functions behave
* I modified the previous source code of the bi-tempered loss function; I separated it into a tempered softmax activation and a tempered loss function

# Reference & Acknowledgements

* Notebook
 *  https://www.kaggle.com/capiru/cassavanet-starter-easy-gpu-tpu-cv-0-9
* Discussion
 * https://www.kaggle.com/c/cassava-leaf-disease-classification/discussion/209773
 * https://www.kaggle.com/c/cassava-leaf-disease-classification/discussion/209782
* Paper
 * https://www.ijcai.org/Proceedings/2020/0305.pdf
 * https://arxiv.org/pdf/1906.03361.pdf
* Github
 * https://github.com/google/bi-tempered-loss
 * https://github.com/Diulhio/bitemperedloss-tf

# Source code

## Import libraries

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.activations import softmax
from tensorflow.keras.losses import CategoricalCrossentropy

## Tempered Softmax Activation

* Bi-tempered logistic loss is one solution for handling noisy labels.

* Previous implementations combine the tempered softmax function and the tempered loss function, but I want to separate them into two functions.

* So I separate the bi-tempered logistic loss function into two classes: TemperedSoftmax and BiTemperedLogisticLoss

In [None]:
# Tempered Softmax Activation

def log_t(u, t):
    """Compute the tempered logarithm `log_t` of `u`.

    Args:
    u: Value(s) to transform; a tensor or a plain number.
    t: Temperature. `t == 1.0` selects the ordinary natural logarithm.
    Returns:
    `log(u + 1e-7)` when `t == 1.0`, otherwise `(u**(1 - t) - 1) / (1 - t)`.
    """
    if t == 1.0:
        # A small offset keeps log() finite when `u` contains exact zeros.
        epsilon = 1e-7
        return tf.math.log(u + epsilon)
    return (u ** (1.0 - t) - 1.0) / (1.0 - t)

def exp_t(u, t):
    """Compute the tempered exponential `exp_t` of `u`.

    Args:
    u: Value(s) to transform.
    t: Temperature. `t == 1.0` selects the ordinary exponential.
    Returns:
    `exp(u)` when `t == 1.0`, otherwise
    `max(0, 1 + (1 - t) * u) ** (1 / (1 - t))`.
    """
    if t != 1.0:
        # Clamp the base at zero before raising it to the power.
        base = tf.math.maximum(0.0, 1.0 + (1.0 - t) * u)
        return base ** (1.0 / (1.0 - t))
    return tf.math.exp(u)

def compute_normalization_fixed_point(y_pred, t2, num_iters=5):
    """Compute the per-example normalization value by fixed-point iteration.

    Intended for the heavy-tailed case (t2 > 1.0).

    Args:
    y_pred: A multi-dimensional tensor with last dimension `num_classes`.
    t2: A temperature 2 (> 1.0 for tail heaviness).
    num_iters: Number of fixed-point iterations to run.
    Returns:
    A tensor of same rank as y_pred with the last dimension being 1.
    """
    # Shift by the per-example maximum so the largest entry becomes zero.
    peak = tf.math.reduce_max(y_pred, -1, keepdims=True)
    shifted = y_pred - peak

    current = shifted
    for _ in range(num_iters):
        partition = tf.math.reduce_sum(exp_t(current, t2), -1, keepdims=True)
        current = shifted * (partition ** (1.0 - t2))

    # One last partition evaluation on the final iterate, then undo the shift.
    partition = tf.math.reduce_sum(exp_t(current, t2), -1, keepdims=True)
    return -log_t(1.0 / partition, t2) + peak

def compute_normalization(y_pred, t2, num_iters=5):
    """Returns the normalization value for each example.

    Only the heavy-tailed case is implemented; it is delegated to
    `compute_normalization_fixed_point`.

    Args:
    y_pred: A multi-dimensional tensor with last dimension `num_classes`.
    t2: A temperature 2 (must be >= 1.0; > 1.0 for tail heaviness).
    num_iters: Number of iterations to run the method.
    Returns:
    A tensor of same rank as y_pred with the last dimension being 1.
    Raises:
    NotImplementedError: If `t2 < 1.0` (the finite-support case is not
        implemented here).
    """
    if t2 < 1.0:
        # Fail loudly instead of returning None, which would only surface
        # later as an unrelated-looking arithmetic error in the caller.
        raise NotImplementedError(
            "compute_normalization only supports t2 >= 1.0 (got t2=%s); "
            "the finite-support case (t2 < 1.0) is not implemented." % (t2,)
        )
    return compute_normalization_fixed_point(y_pred, t2, num_iters)

def tempered_softmax_activation(x, t2=1., num_iters=5):
    """Tempered softmax function.

    Args:
    x: A multi-dimensional tensor with last dimension `num_classes`.
    t2: Temperature. `t2 == 1.0` gives the standard softmax; other values
        are handled by `compute_normalization`, which does not support
        `t2 < 1.0`.
    num_iters: Number of iterations to run the normalization method
        (unused when `t2 == 1.0`).
    Returns:
    A probabilities tensor.
    """
    if t2 == 1.0:
        # reduce_logsumexp is the numerically stable form of
        # log(sum(exp(x))): it avoids overflowing exp() on large inputs.
        normalization_constants = tf.math.reduce_logsumexp(x, axis=-1, keepdims=True)
    else:
        normalization_constants = compute_normalization(x, t2, num_iters)

    return exp_t(x - normalization_constants, t2)

class TemperedSoftmax(tf.keras.layers.Layer):
    """Keras layer that applies `tempered_softmax_activation` to its inputs.

    Args:
    t2: Temperature passed to the tempered softmax (1.0 = standard softmax).
    num_iters: Number of normalization iterations passed to the activation.
    **kwargs: Forwarded to `tf.keras.layers.Layer`.
    """

    def __init__(self, t2=1.0, num_iters=5, **kwargs):
        super().__init__(**kwargs)
        self.t2 = t2
        self.num_iters = num_iters

    def call(self, inputs):
        """Return the tempered softmax of `inputs`."""
        return tempered_softmax_activation(inputs, t2=self.t2, num_iters=self.num_iters)

    def get_config(self):
        """Return the layer config including the constructor arguments.

        Needed so the layer can be re-created from its config.
        """
        config = super().get_config()
        config.update({"t2": self.t2, "num_iters": self.num_iters})
        return config


## Tempered Logistic Loss

In [None]:
# Bi Tempered Logistic Loss

def bi_tempered_logistic_loss(y_pred, y_true, t1, label_smoothing=0.0):
    """Bi-Tempered Logistic Loss computed from probabilities.

    Note the argument order: predictions come first, targets second.

    Args:
    y_pred: A multi-dimensional probability tensor with last dimension `num_classes`.
    y_true: A tensor with the same shape as y_pred.
    t1: Temperature 1 (< 1.0 for boundedness).
    label_smoothing: A float in [0, 1] for label smoothing.
    Returns:
    A loss tensor, summed over the last dimension.
    """
    probs = tf.cast(y_pred, tf.float32)
    targets = tf.cast(y_true, tf.float32)

    if label_smoothing > 0.0:
        # Shrink the targets and spread the removed mass over the classes.
        num_classes = tf.cast(tf.shape(targets)[-1], tf.float32)
        keep = 1 - num_classes / (num_classes - 1) * label_smoothing
        spread = label_smoothing / (num_classes - 1)
        targets = keep * targets + spread

    # Tempered-log term, weighted by the targets.
    log_term = (log_t(targets + 1e-7, t1) - log_t(probs, t1)) * targets
    # Power term of the loss.
    power_term = (1 / (2 - t1)) * (tf.math.pow(targets, 2 - t1) - tf.math.pow(probs, 2 - t1))

    return tf.math.reduce_sum(log_term - power_term, -1)

class BiTemperedLogisticLoss(tf.keras.losses.Loss):
    """Keras loss wrapper around `bi_tempered_logistic_loss`.

    Expects `y_pred` to already be probabilities (e.g. the output of a
    softmax or tempered softmax activation).

    Args:
    t1: Temperature 1 (< 1.0 for boundedness).
    label_smoothing: A float in [0, 1] for label smoothing.
    **kwargs: Forwarded to `tf.keras.losses.Loss` (e.g. `reduction`, `name`).
    """

    def __init__(self, t1, label_smoothing=0.0, **kwargs):
        super().__init__(**kwargs)
        self.t1 = t1
        self.label_smoothing = label_smoothing

    def call(self, y_true, y_pred):
        """Return the per-example loss values."""
        # The underlying function takes predictions first, targets second.
        return bi_tempered_logistic_loss(y_pred, y_true, self.t1, self.label_smoothing)

    def get_config(self):
        """Return the loss config including the constructor arguments."""
        config = super().get_config()
        config.update({"t1": self.t1, "label_smoothing": self.label_smoothing})
        return config

## Taylor Cross Entropy Loss

* Taylor cross entropy loss is also a candidate for handling noisy labels
* But no library is available for Keras/TF
* So I developed a Taylor cross entropy loss implementation, closely following the original paper

In [None]:
# Taylor cross entropy loss
def taylor_cross_entropy_loss(y_pred, y_true, n=3, label_smoothing=0.0):
    """Taylor Cross Entropy Loss.

    Computes `sum_k (1 - p)^k / k` for k = 1..n per class, weights it by the
    targets and sums over the last (class) dimension. Works for inputs of
    any rank whose last dimension is `num_classes`.

    Args:
    y_pred: A multi-dimensional probability tensor with last dimension `num_classes`.
    y_true: A tensor with the same shape as y_pred.
    n: Order of the Taylor expansion (a positive integer).
    label_smoothing: A float in [0, 1] for label smoothing.
    Returns:
    A loss tensor with the last dimension reduced away.
    Raises:
    ValueError: If `n` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))

    y_pred = tf.cast(y_pred, tf.float32)
    y_true = tf.cast(y_true, tf.float32)

    if label_smoothing > 0.0:
        num_classes = tf.cast(tf.shape(y_true)[-1], tf.float32)
        y_true = (1 - num_classes / (num_classes - 1) * label_smoothing) * y_true + label_smoothing / (num_classes - 1)

    # (1 - p), floored to avoid values that are too small.
    residual = tf.math.maximum(1 - y_pred, 1e-7)
    # Put the expansion order on a new trailing axis so the code does not
    # depend on the rank of the inputs: cumprod yields residual**k, k = 1..n.
    powers = tf.math.maximum(
        tf.math.cumprod(tf.stack([residual] * n, axis=-1), axis=-1), 1e-7
    )
    # Shape (n,): broadcasts against the trailing order axis.
    orders = tf.range(1, n + 1, dtype="float32")
    taylor = tf.math.maximum(
        tf.math.reduce_sum(tf.math.divide(powers, orders), axis=-1), 1e-7
    )
    return tf.math.reduce_sum(y_true * taylor, axis=-1)

class TaylorCrossEntropyLoss(tf.keras.losses.Loss):
    """Keras loss wrapper around `taylor_cross_entropy_loss`.

    Expects `y_pred` to already be probabilities.

    Args:
    n: Order of the Taylor expansion.
    label_smoothing: A float in [0, 1] for label smoothing.
    **kwargs: Forwarded to `tf.keras.losses.Loss` (e.g. `reduction`, `name`).
    """

    def __init__(self, n=3, label_smoothing=0.0, **kwargs):
        super().__init__(**kwargs)
        self.n = n
        self.label_smoothing = label_smoothing

    def call(self, y_true, y_pred):
        """Return the per-example loss values."""
        # The underlying function takes predictions first, targets second.
        return taylor_cross_entropy_loss(y_pred, y_true, n=self.n, label_smoothing=self.label_smoothing)

    def get_config(self):
        """Return the loss config including the constructor arguments."""
        config = super().get_config()
        config.update({"n": self.n, "label_smoothing": self.label_smoothing})
        return config


# Experiment

### Bi-tempered logistic loss

* Bi-tempered logistic loss has a parameter t1
* The closer t1 is to 1, the closer the outputs are to those of categorical cross entropy

### Taylor cross entropy

* Taylor cross entropy has a parameter n, which is the order of the Taylor expansion
* The larger n is (i.e. the higher the order of the expansion), the closer the outputs are to those of categorical cross entropy

In [None]:
# Example predictions (one row per example) and their one-hot targets.
y_pred = [
    [0.00001, 0.00002, 0.00003, 0.99994],
    [0.99999999, 0.000000003, 0.000000001, 0.000000001],
    [0.999950, 0.00003, 0.00001, 0.00001],
    [0.999950, 0.00003, 0., 0.00002],
    [8.67612648e-09, 3.44215927e-08, 5.46933476e-09, 1],
    [0.25, 0.25, 0.25, 0.25],
]

y_true = [
    [0., 0., 0., 1.],
    [1., 0., 0., 0.],
    [0., 1., 0., 0.],
    [1., 0., 0., 0.],
    [0., 0., 0., 1.],
    [0., 1., 0., 0.],
]

# Loss objects under comparison; kept as module-level names for later cells.
ccel = CategoricalCrossentropy()
btll_02 = BiTemperedLogisticLoss(t1=0.2)
btll_08 = BiTemperedLogisticLoss(t1=0.8)
btll_0999 = BiTemperedLogisticLoss(t1=0.999)
btll_1 = BiTemperedLogisticLoss(t1=1)
tcel_3 = TaylorCrossEntropyLoss(n=3)
tcel_30 = TaylorCrossEntropyLoss(n=30)
tcel_30000 = TaylorCrossEntropyLoss(n=30000)

# (message template, loss) pairs, evaluated and printed in this order.
loss_reports = (
    ("Categorical cross entropy: %s", ccel),
    ("Bi-tempered logistic loss (t1=0.2): %s", btll_02),
    ("Bi-tempered logistic loss (t1=0.8): %s", btll_08),
    ("Bi-tempered logistic loss (t1=0.999): %s", btll_0999),
    ("Bi-tempered logistic loss (t1=1.0): %s", btll_1),
    ("Taylor cross entropy loss (n=3): %s", tcel_3),
    ("Taylor cross entropy loss (n=30): %s", tcel_30),
    ("Taylor cross entropy loss (n=30000): %s", tcel_30000),
)
for template, loss_fn in loss_reports:
    print(template % loss_fn(y_true, y_pred).numpy())

### With label smoothing

* I also investigate the effect of label smoothing

In [None]:
# Same comparison as above, with label smoothing enabled on every loss.
label_smoothing = 0.1
smoothed_ccel = CategoricalCrossentropy(label_smoothing=label_smoothing)
smoothed_btll_02 = BiTemperedLogisticLoss(t1=0.2, label_smoothing=label_smoothing)
smoothed_btll_08 = BiTemperedLogisticLoss(t1=0.8, label_smoothing=label_smoothing)
smoothed_btll_0999 = BiTemperedLogisticLoss(t1=0.999, label_smoothing=label_smoothing)
smoothed_btll_1 = BiTemperedLogisticLoss(t1=1, label_smoothing=label_smoothing)
smoothed_tcel_3 = TaylorCrossEntropyLoss(n=3, label_smoothing=label_smoothing)
smoothed_tcel_30 = TaylorCrossEntropyLoss(n=30, label_smoothing=label_smoothing)
smoothed_tcel_30000 = TaylorCrossEntropyLoss(n=30000, label_smoothing=label_smoothing)

# (message template, loss) pairs, evaluated and printed in this order.
smoothed_reports = (
    ("Smoothed categorical cross entropy: %s", smoothed_ccel),
    ("Smoothed bi-tempered logistic loss (t1=0.2): %s", smoothed_btll_02),
    ("Smoothed bi-tempered logistic loss (t1=0.8): %s", smoothed_btll_08),
    ("Smoothed bi-tempered logistic loss (t1=0.999): %s", smoothed_btll_0999),
    ("Smoothed bi-tempered logistic loss (t1=1.0): %s", smoothed_btll_1),
    ("Smoothed taylor cross entropy loss (n=3): %s", smoothed_tcel_3),
    ("Smoothed taylor cross entropy loss (n=30): %s", smoothed_tcel_30),
    ("Smoothed taylor cross entropy loss (n=30000): %s", smoothed_tcel_30000),
)
for template, loss_fn in smoothed_reports:
    print(template % loss_fn(y_true, y_pred).numpy())

### Tempered Softmax Activation

* Tempered softmax activation has a parameter t2
* The closer t2 is to 1, the closer outputs are to that of simple softmax activation

In [None]:
# Example activations (one row per example) fed to each softmax variant.
activation = [
    [0.00, 0.003, 0.002, 5.],
    [8.1, 0.003, 0.01, 0.0003],
    [2.0, 0.005, 0.006, 0.0001],
    [6.0, 0.01, 0.001, 0.001],
    [10., 0.0002, 0.002, 0.3],
    [5.3, 0.001, 0.4, 0.3],
]
activation_tf = tf.cast(activation, tf.float32)

# Tempered softmax layers; kept as module-level names for later cells.
tempered_softmax_2 = TemperedSoftmax(t2=2)
tempered_softmax_4 = TemperedSoftmax(t2=4)
tempered_softmax_1001 = TemperedSoftmax(t2=1.001)

y_pred_softmax = softmax(activation_tf)
y_pred_tempered_softmax_2 = tempered_softmax_2(activation_tf)
y_pred_tempered_softmax_4 = tempered_softmax_4(activation_tf)
y_pred_tempered_softmax_1001 = tempered_softmax_1001(activation_tf)

# (heading, tensor) pairs; an empty line separates consecutive sections.
softmax_sections = (
    ("The softmax reult", y_pred_softmax),
    ("The tempered softmax reult (t2=2)", y_pred_tempered_softmax_2),
    ("The tempered softmax reult (t2=4)", y_pred_tempered_softmax_4),
    ("The tempered softmax reult (t2=1.001)", y_pred_tempered_softmax_1001),
)
for position, (heading, probabilities) in enumerate(softmax_sections):
    if position > 0:
        print("")
    print(heading)
    print(probabilities)

### Bi-tempered logistic loss

* I combine the tempered softmax function and the tempered logistic loss
* The closer t1 and t2 are to 1, the closer the outputs are to those of categorical cross entropy with softmax

In [None]:
# Example activations (one row per example) and their one-hot targets.
activation = [
    [0.001, 0.003, 0.002, 5.],
    [8.1, 0.003, 0.01, 0.0003],
    [2.0, 0.005, 0.006, 0.0001],
    [6.0, 0.01, 0.001, 0.001],
    [10., 0.0002, 0.002, 0.3],
    [5.3, 0.001, 0.4, 0.3],
]
activation_tf = tf.cast(activation, tf.float32)
y_true = [
    [0., 0., 0., 1.],
    [1., 0., 0., 0.],
    [1., 0., 0., 0.],
    [1., 0., 0., 0.],
    [0., 1., 0., 0.],
    [0., 1., 0., 0.],
]

print("Categorical cross entropy with softmax: %s" % ccel(y_true, softmax(activation_tf)).numpy())

# Every (t1, t2) combination is printed from the same tables that select the
# loss and the activation, so a printed label cannot drift from the objects
# actually evaluated (the original mislabeled one t1=0.8 row as t1=0.2).
losses_by_t1 = (
    ("0.2", btll_02),
    ("0.8", btll_08),
    ("0.999", btll_0999),
)
activations_by_t2 = (
    ("1.", softmax),
    ("1.001", tempered_softmax_1001),
    ("2.", tempered_softmax_2),
    ("4.", tempered_softmax_4),
)
for t1_label, loss_fn in losses_by_t1:
    for t2_label, activation_fn in activations_by_t2:
        loss_value = loss_fn(y_true, activation_fn(activation_tf)).numpy()
        print("Bi-tempered logistic loss (t1=%s, t2=%s): %s" % (t1_label, t2_label, loss_value))