# Training a Classifier on the *Salammbô* Dataset with Keras
Author: Pierre Nugues

We use three classes: French, English, and German

We first need to import some modules

In [1]:
from tensorflow import keras
from tensorflow.keras import layers
import numpy as np

### Reading the dataset
We can read the data from a file with the svmlight format or directly create numpy arrays

In [2]:
# Feature matrix: one observation per row, two integer counts per observation.
X = np.array([
    # rows 0-14: observations labelled 0
    [35680, 2217], [42514, 2761], [15162, 990],
    [35298, 2274], [29800, 1865], [40255, 2606],
    [74532, 4805], [37464, 2396], [31030, 1993],
    [24843, 1627], [36172, 2375], [39552, 2560],
    [72545, 4597], [75352, 4871], [18031, 1119],
    # rows 15-29: observations labelled 1
    [36961, 2503], [43621, 2992], [15694, 1042],
    [36231, 2487], [29945, 2014], [40588, 2805],
    [75255, 5062], [37709, 2643], [30899, 2126],
    [25486, 1784], [37497, 2641], [40398, 2766],
    [74105, 5047], [76725, 5312], [18317, 1215],
])

# Class labels in the same row order as X: fifteen 0s followed by fifteen 1s.
y = np.repeat([0, 1], 15)

We add German data and we adjust `y`

In [3]:
# Number of observations per class; every class contributes 15 rows.
N_PER_CLASS = 15

# Observations for the third class (German, label 2), with the same two
# features as the rows already in X.
# NOTE(review): rows 4, 8 and 12 below ([29800, 1865], [31030, 1993],
# [72545, 4597]) are identical to rows 4, 8 and 12 of the class-0 block of X.
# They look like placeholders or copy-paste leftovers -- confirm against the
# source data.
X_de = np.array(
    [[37599, 1771], [44565, 2116], [16156, 715], [37697, 1804],
     [29800, 1865], [42606, 2146], [78242, 3813], [40341, 1955],
     [31030, 1993], [26676, 1346], [39250, 1902], [41780, 2106],
     [72545, 4597], [79195, 3988], [19020, 928]
     ])

# Keep only the original two-class rows before stacking, so that re-running
# this cell does not append the German rows a second time.
X = np.vstack((X[:2 * N_PER_CLASS], X_de))

# Labels 0, 1 and 2, each repeated N_PER_CLASS times, in the row order of X.
y = np.repeat([0, 1, 2], N_PER_CLASS)

assert X.shape[0] == y.shape[0], "X and y must have one label per row"

## Scaling the Data
Scaling and normalizing the inputs usually matters a great deal with neural networks. We use scikit-learn transformers, which expose two main methods: `fit()` and `transform()`.

### Normalizing

In [4]:
from sklearn.preprocessing import Normalizer

# Rescale every row of X to unit norm (scikit-learn's default is the L2 norm),
# then show the first four rows.
normalizer = Normalizer().fit(X)
X_norm = normalizer.transform(X)
X_norm[:4]



array([[0.99807515, 0.06201605],
       [0.99789783, 0.06480679],
       [0.99787509, 0.06515607],
       [0.99793128, 0.06428964]])

### Standardizing

In [5]:
from sklearn.preprocessing import StandardScaler

# Centre each column of the normalized data on zero and scale it to unit
# variance, then show the first four rows.
scaler = StandardScaler(with_mean=True, with_std=True).fit(X_norm)
X_scaled = scaler.transform(X_norm)
X_scaled[:4]

array([[-0.03108396,  0.0944527 ],
       [-0.4126595 ,  0.44232074],
       [-0.46160343,  0.48585864],
       [-0.34067721,  0.37785758]])

In [6]:
# One-hot encode the integer labels in y: each row of Y_cat holds a 1 in the
# column of its class, the target format used with the categorical
# cross-entropy loss when the model is compiled.
Y_cat = keras.utils.to_categorical(y)
Y_cat[:4]

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]], dtype=float32)

## Creating a Model

We set a seed to have reproducible results

In [7]:
# Seed Python's `random`, NumPy and TensorFlow in a single call. Seeding NumPy
# alone leaves the Keras weight initialisation unseeded, because Keras draws
# the initial weights from TensorFlow's random generator.
# Requires TensorFlow >= 2.7.
keras.utils.set_random_seed(1337)

We create a classifier equivalent to a logistic regression with `softmax`

In [8]:
# A single dense softmax layer with one output unit per class.
model = keras.Sequential()
model.add(layers.Dense(3, activation='softmax'))

2022-09-01 21:38:41.496286: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Or with one hidden layer

In [9]:
# One hidden ReLU layer of 10 units followed by the 3-way softmax output.
# A `layers.Dropout(0.5)` layer can be inserted between the two layers
# to experiment with regularisation.
model2 = keras.Sequential()
model2.add(layers.Dense(10, activation='relu'))
model2.add(layers.Dense(3, activation='softmax'))

To try the network with one hidden layer, set `complex` to `True`

In [10]:
# Architecture switch: a truthy value selects the network with one hidden
# layer (model2); otherwise `model` stays the plain softmax classifier.
# The test uses plain truthiness so that it agrees with the `if complex:`
# check in the cell that recomputes the probabilities by hand.
# NOTE(review): the name `complex` shadows Python's built-in `complex` type;
# it is kept because a later cell reads it.
complex = True
if complex:
    model = model2

## Fitting the Model

We compile and fit the model

In [11]:
# Categorical cross-entropy minimised with plain SGD; accuracy is reported as
# an additional metric.
model.compile(
    optimizer='sgd',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
# Train on one observation at a time (batch_size=1) for 30 epochs.
model.fit(x=X_scaled, y=Y_cat, batch_size=1, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f85197cd8e0>

### The weights

In [12]:
# Show the trained parameters as a list of NumPy arrays: for each Dense
# layer, the weight matrix followed by the bias vector.
model.get_weights()

[array([[ 0.57191426,  0.60600066, -0.38973573,  0.25049552,  0.6779418 ,
          0.58679605,  0.12810422,  0.6276153 , -0.6075459 , -1.1770915 ],
        [-0.3518648 ,  0.32574832, -0.2183541 ,  0.477886  , -0.73888886,
         -0.7050051 , -0.36574027,  0.6027073 ,  0.02931863,  0.55345607]],
       dtype=float32),
 array([ 0.03520421,  0.09092899, -0.23099695, -0.09910319,  0.08919332,
         0.3221798 ,  0.03708989, -0.04844185,  0.3929666 , -0.5140454 ],
       dtype=float32),
 array([[-0.4739945 ,  0.21661673,  0.19615576],
        [ 0.02035217,  0.17963284,  0.54971784],
        [ 0.48003533, -0.19417204,  0.5744656 ],
        [-0.47567087, -0.3352624 , -0.33981088],
        [-0.54159695, -0.53238064,  0.5559826 ],
        [ 0.19307546, -0.445962  ,  0.95215404],
        [-0.39203918, -0.31261498,  0.34449145],
        [ 0.04308302,  0.56344324, -0.41226292],
        [-0.0287454 , -0.08729865, -0.8919198 ],
        [-0.93765235,  1.2449181 ,  0.19932221]], dtype=float32),
 

## Prediction
### Probabilities

We compute the class-membership probabilities for every observation in the training set

In [13]:
# Forward pass over the training inputs: one row of class probabilities per
# observation.
Y_pred_proba = model.predict(X_scaled)



In [14]:
# From here on, print floats with 3 decimals and without scientific notation.
np.set_printoptions(precision=3, suppress=True)
Y_pred_proba

array([[0.622, 0.236, 0.142],
       [0.533, 0.367, 0.1  ],
       [0.494, 0.406, 0.1  ],
       [0.588, 0.312, 0.101],
       [0.624, 0.247, 0.129],
       [0.555, 0.344, 0.1  ],
       [0.583, 0.316, 0.101],
       [0.628, 0.271, 0.101],
       [0.607, 0.292, 0.101],
       [0.472, 0.429, 0.099],
       [0.454, 0.448, 0.098],
       [0.557, 0.343, 0.1  ],
       [0.625, 0.266, 0.109],
       [0.565, 0.334, 0.1  ],
       [0.62 , 0.234, 0.147],
       [0.24 , 0.683, 0.077],
       [0.17 , 0.765, 0.065],
       [0.372, 0.535, 0.093],
       [0.167, 0.769, 0.064],
       [0.283, 0.634, 0.083],
       [0.137, 0.805, 0.058],
       [0.282, 0.635, 0.083],
       [0.088, 0.867, 0.045],
       [0.156, 0.782, 0.062],
       [0.091, 0.862, 0.046],
       [0.074, 0.885, 0.041],
       [0.179, 0.754, 0.067],
       [0.207, 0.721, 0.072],
       [0.129, 0.814, 0.056],
       [0.379, 0.528, 0.093],
       [0.004, 0.001, 0.995],
       [0.005, 0.002, 0.994],
       [0.001, 0.   , 0.998],
       [0.

We recompute these probabilities with explicit matrix operations

In [15]:
from tensorflow.keras.activations import softmax, relu

# Redo the forward pass by hand from the trained parameters, fetching the
# weight list once instead of once per term.
weights = model.get_weights()
if complex:
    # Hidden ReLU layer followed by the softmax output layer.
    W_hidden, b_hidden, W_out, b_out = weights
    hidden = relu(X_scaled @ W_hidden + b_hidden)
    probs = softmax(hidden @ W_out + b_out)
else:
    # Single softmax layer.
    W_out, b_out = weights
    probs = softmax(X_scaled @ W_out + b_out)
print(probs[:4])

tf.Tensor(
[[0.622 0.236 0.142]
 [0.533 0.367 0.1  ]
 [0.494 0.406 0.1  ]
 [0.588 0.312 0.101]], shape=(4, 3), dtype=float64)


### Classes

In [16]:
# Predicted class = column index of the largest probability in each row.
y_pred = Y_pred_proba.argmax(axis=-1)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2,
       2])

## Loss
We recompute the loss

For one observation

In [17]:
# Cross-entropy of the first observation: minus the dot product of its
# one-hot target with the log of its predicted probabilities.
-np.dot(Y_cat[0], np.log(Y_pred_proba[0]))

0.47531605

For the dataset

In [18]:
# Mean categorical cross-entropy over the dataset: sum the element-wise
# products of the one-hot targets with the log-probabilities, negate, and
# divide by the number of observations.
# NOTE(review): a predicted probability of exactly 0 would produce
# 0 * log(0) = NaN in this expression.
-np.sum(Y_cat * np.log(Y_pred_proba)) / Y_cat.shape[0]

0.4451290554470486

## Evaluation

With Keras

In [19]:
# Returns the loss followed by the metrics passed to compile() (here, accuracy),
# computed on the same data the model was trained on.
model.evaluate(X_scaled, Y_cat)



[0.44512906670570374, 0.9333333373069763]

With sklearn

In [20]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 of the predicted labels against the
# true labels.
report = classification_report(y, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.83      1.00      0.91        15
           1       1.00      1.00      1.00        15
           2       1.00      0.80      0.89        15

    accuracy                           0.93        45
   macro avg       0.94      0.93      0.93        45
weighted avg       0.94      0.93      0.93        45



We computed the accuracy on the training set. This is not good practice: the figure measures how well the model fits data it has already seen. We should use a dedicated test set instead.