### Import Libraries

In [10]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import logging
logging.getLogger("tensorflow").setLevel(logging.ERROR)
tf.autograph.set_verbosity(0)
from sklearn.datasets import make_blobs

### Problem Statement
This lab introduces 2 ways of implementing softmax with cross-entropy loss in TensorFlow
- The 'obvious' method: straightforward to implement
- The 'preferred' method: numerically more stable

##### Obvious Method
- Softmax is an activation in the final dense layer
- The loss function is separately specified in the compile directive
- The loss function is SparseCategoricalCrossentropy
- The loss function takes the softmax output (which is a vector of probs)

In [11]:
def my_softmax(z):
    """Return the softmax of ``z``, normalised over all of its elements.

    The input is shifted by its maximum before exponentiation. Softmax is
    invariant to adding a constant to every element, so the result is
    mathematically unchanged, but the shift keeps ``np.exp`` from
    overflowing for large logits (which would otherwise produce ``nan``).

    Args:
        z: array-like of logits (any shape); the sum runs over every element.

    Returns:
        Array with the same shape as ``z`` whose elements sum to 1.
        An empty input yields an empty array.
    """
    z = np.asarray(z)
    if z.size == 0:
        # np.max is undefined on an empty array; nothing to normalise anyway.
        return np.exp(z)
    ez = np.exp(z - np.max(z))  # element-wise exponential of the shifted logits
    sm = ez / np.sum(ez)
    return sm

In [12]:
# Build a synthetic 4-class data set: one 2-D Gaussian blob per center.
centers = [
    [-5, 2],
    [-2, -2],
    [1, 2],
    [5, -2],
]
X_train, y_train = make_blobs(
    n_samples=2000,
    centers=centers,
    cluster_std=1.0,
    random_state=30,  # fixed seed so the generated points are reproducible
)

In [13]:
# "Obvious" formulation: the network itself emits probabilities because the
# output layer applies softmax, and the loss consumes those probabilities.
model = Sequential([
    Dense(units=25, activation="relu"),
    Dense(units=15, activation="relu"),
    Dense(units=4, activation="softmax"),  # softmax lives in the output layer
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
)

model.fit(x=X_train, y=y_train, epochs=10)

Epoch 1/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 1.4108 
Epoch 2/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 768us/step - loss: 0.5722
Epoch 3/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 957us/step - loss: 0.1903
Epoch 4/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 697us/step - loss: 0.1086
Epoch 5/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 893us/step - loss: 0.0738
Epoch 6/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 597us/step - loss: 0.0645
Epoch 7/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 601us/step - loss: 0.0559
Epoch 8/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 589us/step - loss: 0.0441
Epoch 9/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 590us/step - loss: 0.0402
Epoch 10/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 768us/step - los

<keras.src.callbacks.history.History at 0x25608fad5d0>

In [14]:
# The softmax output layer means predict() already returns probabilities.
p_nonpreferred = model.predict(X_train)
print(p_nonpreferred[:2])
print(
    "largest value", p_nonpreferred.max(),
    "smallest value", p_nonpreferred.min(),
)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
[[3.4422115e-03 6.5101660e-03 9.7329980e-01 1.6747912e-02]
 [9.9420905e-01 5.6993505e-03 3.5875004e-05 5.5737473e-05]]
largest value 0.9999987 smallest value 5.78011e-08


##### Preferred Method
- More stable and accurate results can be obtained if softmax and loss are combined during training
- In this method, the final layer has linear activation. The outputs in this form are referred to as LOGITS. 
    - Logits are not probabilities
    - Logits range from large positive to large negative numbers
- The loss function has an additional argument: from_logits=True. This informs the loss fn that the softmax operation should be included in the loss calculation, thus allowing an optimized implementation
- During inference, to convert the output to probabilities, we use model.predict() to get the logits and then apply the softmax function to get the final probabilities
- Use np.argmax after the above step to predict the final category

In [15]:
# "Preferred" formulation: the output layer is linear (emits logits) and the
# loss is told to apply softmax internally via from_logits=True.
preferred_model = Sequential([
    Dense(units=25, activation="relu"),
    Dense(units=15, activation="relu"),
    Dense(units=4, activation="linear"),  # <-- logits, not probabilities
])

preferred_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),  # <-- softmax folded into the loss
)

preferred_model.fit(x=X_train, y=y_train, epochs=10)

Epoch 1/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 1.2722  
Epoch 2/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.6646
Epoch 3/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.3068
Epoch 4/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.1534
Epoch 5/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0954  
Epoch 6/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0785
Epoch 7/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 893us/step - loss: 0.0621
Epoch 8/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0661
Epoch 9/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 885us/step - loss: 0.0556
Epoch 10/10
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.0424


<keras.src.callbacks.history.History at 0x25609e440d0>

In [16]:
# With a linear output layer, predict() returns raw logits, so the values
# are unbounded rather than confined to [0, 1].
p_preferred = preferred_model.predict(X_train)
print(f"two example output vectors:\n {p_preferred[:2]}")
print(
    "largest value", p_preferred.max(),
    "smallest value", p_preferred.min(),
)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
two example output vectors:
 [[-3.2156823  -5.1317887   3.1991339  -0.1272727 ]
 [ 4.9159737   0.36876115 -2.3493698  -2.4800317 ]]
largest value 10.581799 smallest value -17.46659


In [17]:
# Turn the logits into probabilities: softmax across the class axis of each row.
sm_preferred = tf.nn.softmax(p_preferred, axis=-1).numpy()
print(f"two example output vectors:\n {sm_preferred[:2]}")
print(
    "largest value", sm_preferred.max(),
    "smallest value", sm_preferred.min(),
)

two example output vectors:
 [[1.5774921e-03 2.3217339e-04 9.6357685e-01 3.4613568e-02]
 [9.8823035e-01 1.0471982e-02 6.9112965e-04 6.0647615e-04]]
largest value 0.9999993 smallest value 1.9997049e-12


In [18]:
# The predicted class is the index of the largest logit. Softmax is monotonic,
# so taking argmax on the logits picks the same class as on the probabilities.
for i in range(5):
    logits = p_preferred[i]
    category = np.argmax(logits)
    print(f"{logits}, category: {category}")

[-3.2156823 -5.1317887  3.1991339 -0.1272727], category: 2
[ 4.9159737   0.36876115 -2.3493698  -2.4800317 ], category: 0
[ 3.6377347   0.54282904 -1.8117585  -2.0465922 ], category: 0
[-2.3068862  4.0382013 -1.3971614 -0.9748617], category: 1
[-2.1581862 -5.289384   4.017906  -2.3515913], category: 2


Note: TF has 2 possible formats for target values, and the choice of loss defines which one is expected:
1. SparseCategoricalCrossentropy - expects the target to be an **integer** corresponding to the class index. For example, if there are 10 potential target values, y should be between 0 & 9.
2. CategoricalCrossentropy - expects the target value of an example to be one-hot encoded, where the value at the target index is 1 and the other N-1 entries are 0. An example with 5 potential target values, where the target is 2, would be [0, 0, 1, 0, 0]. Use this for multi-class classification when the labels are already one-hot encoded.

Summary
- The preferred model construction is as follows:
1. No activation in the final layer - same as linear activation
2. Use SparseCategoricalCrossentropy
3. Use from_logits=True
- Unlike Relu and Sigmoid, Softmax spans multiple outputs