# Training a Classifier on the *Salammbô* Dataset with Keras
Author: Pierre Nugues

We first need to import some modules

In [1]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
import numpy as np

### Reading the dataset
We can either read the data from a file in the svmlight format or create the NumPy arrays directly. Here we create the arrays directly.

In [2]:
# Class labels: the first 15 samples are class 0, the remaining 15 are class 1.
y_train = np.repeat([0, 1], 15)

# One row per sample, two raw features per row, stored as float32.
# Row order matches y_train: rows 0-14 are class 0, rows 15-29 are class 1.
# NOTE(review): the columns look like (total count, count of one symbol) per
# text -- confirm against the dataset description.
X_train = np.array([
    # class 0
    [35680, 2217], [42514, 2761], [15162, 990],
    [35298, 2274], [29800, 1865], [40255, 2606],
    [74532, 4805], [37464, 2396], [31030, 1993],
    [24843, 1627], [36172, 2375], [39552, 2560],
    [72545, 4597], [75352, 4871], [18031, 1119],
    # class 1
    [36961, 2503], [43621, 2992], [15694, 1042],
    [36231, 2487], [29945, 2014], [40588, 2805],
    [75255, 5062], [37709, 2643], [30899, 2126],
    [25486, 1784], [37497, 2641], [40398, 2766],
    [74105, 5047], [76725, 5312], [18317, 1215],
], dtype=np.float32)

## Scaling the Data
Scaling and normalizing the inputs usually matter a great deal with neural networks. We use scikit-learn transformers, which expose two main methods: `fit()` and `transform()`.

### Normalizing

In [3]:
from sklearn.preprocessing import Normalizer

# Normalizer rescales every sample (row) on its own so that the row has unit
# norm; the displayed rows below each have squared entries summing to ~1.
# fit_transform() is the one-call form of fit() followed by transform().
normalizer = Normalizer()
X_train_norm = normalizer.fit_transform(X_train)
X_train_norm

array([[0.9980751 , 0.06201605],
       [0.99789774, 0.06480679],
       [0.9978751 , 0.06515607],
       [0.9979313 , 0.06428964],
       [0.99804735, 0.06246169],
       [0.9979111 , 0.06460207],
       [0.9979283 , 0.06433539],
       [0.99796116, 0.06382433],
       [0.99794376, 0.06409609],
       [0.9978623 , 0.06535129],
       [0.9978515 , 0.06551746],
       [0.9979119 , 0.06458977],
       [0.99799836, 0.06324073],
       [0.9979172 , 0.06450863],
       [0.99807984, 0.06194062],
       [0.9977148 , 0.06756528],
       [0.9976559 , 0.06843004],
       [0.99780315, 0.06624894],
       [0.99765235, 0.06848173],
       [0.99774593, 0.06710504],
       [0.9976205 , 0.06894466],
       [0.9977454 , 0.06711297],
       [0.9975528 , 0.06991784],
       [0.9976413 , 0.06864253],
       [0.997559  , 0.06982835],
       [0.99752885, 0.07025825],
       [0.9976642 , 0.06830881],
       [0.99768883, 0.06794866],
       [0.99761194, 0.06906894],
       [0.9978073 , 0.06618638]], dtype=float32)

### Standardizing

In [4]:
from sklearn.preprocessing import StandardScaler

# Standardize the normalized features column by column: centring (with_mean)
# and scaling (with_std) are both requested explicitly.
# fit_transform() learns the column statistics and applies them in one call.
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_scaled = scaler.fit_transform(X_train_norm)
X_train_scaled

array([[ 1.6832309 , -1.7197777 ],
       [ 0.5732034 , -0.56145513],
       [ 0.431466  , -0.41648194],
       [ 0.7831985 , -0.7761007 ],
       [ 1.5094161 , -1.5348101 ],
       [ 0.65675384, -0.64642584],
       [ 0.76454884, -0.7571132 ],
       [ 0.97006804, -0.9692323 ],
       [ 0.861154  , -0.85643595],
       [ 0.35127246, -0.3354572 ],
       [ 0.28376073, -0.2664867 ],
       [ 0.66160274, -0.6515314 ],
       [ 1.2028158 , -1.2114625 ],
       [ 0.6947991 , -0.685208  ],
       [ 1.7126973 , -1.7510854 ],
       [-0.57151246,  0.58348024],
       [-0.9400297 ,  0.9424063 ],
       [-0.01873669,  0.03712195],
       [-0.96240926,  0.9638616 ],
       [-0.37681007,  0.3924546 ],
       [-1.1615876 ,  1.1560036 ],
       [-0.380167  ,  0.39574805],
       [-1.5853077 ,  1.5599338 ],
       [-1.031413  ,  1.0306025 ],
       [-1.5465164 ,  1.5227875 ],
       [-1.735251  ,  1.7012239 ],
       [-0.88818365,  0.89208937],
       [-0.7341374 ,  0.7426075 ],
       [-1.2152987 ,

## Fitting the Data

We set a random seed to make the results reproducible.

In [5]:
# Seed NumPy's global random number generator with a fixed value.
# NOTE(review): this call seeds NumPy only. The training log below shows a
# TensorFlow backend, which may keep its own RNG state for weight
# initialisation -- confirm whether it needs a separate seed for fully
# reproducible runs.
np.random.seed(1337)

We create a classifier equivalent to a logistic regression and we fit a model

In [6]:
# A single sigmoid unit fed by the two input features.
model = Sequential()
model.add(Dense(1, input_dim=2, activation='sigmoid'))

# Binary cross-entropy loss minimised with plain SGD; accuracy is tracked
# as an additional metric.
model.compile(optimizer='sgd',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# batch_size=1 processes one sample per step; verbose=False silences the
# per-epoch progress output.
model.fit(X_train_scaled, y_train, batch_size=1, epochs=30, verbose=False)

2021-09-10 18:20:32.496638: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-09-10 18:20:33.022663: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


<keras.callbacks.History at 0x7fb2f82a6e50>

### The weights

In [7]:
# Display the learned parameters of the model as a list of arrays
# (shown below: a 2x1 array followed by a 1-element array -- presumably the
# per-feature weights and the bias; confirm the order against the Keras docs).
model.get_weights()

[array([[-0.9763438],
        [ 1.853584 ]], dtype=float32),
 array([0.03650518], dtype=float32)]

## We evaluate the model

We compute the probability of belonging to class 1 for every sample in the training set.

In [8]:
# Run the fitted model on the same scaled samples it was trained on, one
# sample per batch; the displayed result holds one sigmoid output per sample.
predicted_probs = model.predict(X_train_scaled, batch_size=1)
predicted_probs

array([[0.00820595],
       [0.17309597],
       [0.23926044],
       [0.10277784],
       [0.01362538],
       [0.14149636],
       [0.1078079 ],
       [0.06255165],
       [0.08380368],
       [0.28327876],
       [0.32420838],
       [0.13978043],
       [0.0328182 ],
       [0.12876213],
       [0.00752878],
       [0.8423778 ],
       [0.93709314],
       [0.5308625 ],
       [0.9406292 ],
       [0.7561799 ],
       [0.9648844 ],
       [0.7579056 ],
       [0.98874676],
       [0.9504441 ],
       [0.98749506],
       [0.9924906 ],
       [0.92805845],
       [0.89376235],
       [0.9695699 ],
       [0.5124932 ]], dtype=float32)

In [None]:
from sklearn.metrics import accuracy_score

In [10]:
def predict_class(preds, threshold=0.5):
    """Convert predicted probabilities into hard 0/1 class labels.

    Args:
        preds: One probability per sample, given either as a flat sequence
            of length n or as a column of shape (n, 1), such as the
            ``predicted_probs`` array passed in by this notebook. Plain
            Python lists are accepted as well as NumPy arrays.
        threshold: Probabilities greater than or equal to this value are
            mapped to class 1, all others to class 0. Defaults to 0.5.

    Returns:
        A 1-D integer NumPy array of length n containing 0s and 1s.

    Raises:
        ValueError: If ``preds`` is a scalar or holds more than one value
            per sample, since the label would then be ambiguous.
    """
    probs = np.asarray(preds)
    # Exactly one value per sample: accept (n,) or (n, 1), reject scalars
    # and wider matrices instead of silently flattening them.
    if probs.ndim == 0 or probs.size != probs.shape[0]:
        raise ValueError(
            "preds must contain exactly one probability per sample")
    # Vectorized comparison replaces the element-by-element Python loop.
    return (probs.reshape(-1) >= threshold).astype(int)

In [11]:
# Turn the predicted probabilities into hard 0/1 labels with predict_class
# and display them.
classes = predict_class(predicted_probs)
classes

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1])

In [12]:
# Compare the predicted labels with the true training labels y_train;
# the displayed score is the accuracy on the training set itself.
accuracy_score(y_train, classes)

1.0

We computed the accuracy from the training set. This is not a good practice. We should use a dedicated test set instead.