# Training a Classifier on the *Salammbô* Dataset with PyTorch
Author: Pierre Nugues

We use three classes: French, English, and German

We first need to import some modules

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

### Reading the dataset
We can read the data from a file with the svmlight format or directly create numpy arrays

In [2]:
X = np.array(
    [[35680, 2217], [42514, 2761], [15162, 990], [35298, 2274],
     [29800, 1865], [40255, 2606], [74532, 4805], [37464, 2396],
     [31030, 1993], [24843, 1627], [36172, 2375], [39552, 2560],
     [72545, 4597], [75352, 4871], [18031, 1119], [36961, 2503],
     [43621, 2992], [15694, 1042], [36231, 2487], [29945, 2014],
     [40588, 2805], [75255, 5062], [37709, 2643], [30899, 2126],
     [25486, 1784], [37497, 2641], [40398, 2766], [74105, 5047],
     [76725, 5312], [18317, 1215]
     ])

We add German data and we adjust `y`

In [3]:
X_de = np.array(
    [[37599, 1771], [44565, 2116], [16156, 715], [37697, 1804],
     [29800, 1865], [42606, 2146], [78242, 3813], [40341, 1955],
     [31030, 1993], [26676, 1346], [39250, 1902], [41780, 2106],
     [72545, 4597], [79195, 3988], [19020, 928]
     ])

X = np.vstack((X, X_de))

y = np.array(
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Scaling the Data
Scaling and normalizing are usually very significant with neural networks. We use sklean transformers. They consist of two main methods: `fit()` and `transform()`.

### Normalizing

In [4]:
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
X_norm = normalizer.fit_transform(X)
X_norm[:4]



array([[0.99807515, 0.06201605],
       [0.99789783, 0.06480679],
       [0.99787509, 0.06515607],
       [0.99793128, 0.06428964]])

### Standardizing

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=True, with_std=True)
X_scaled = scaler.fit_transform(X_norm)
X_scaled[:4]

array([[-0.03108396,  0.0944527 ],
       [-0.4126595 ,  0.44232074],
       [-0.46160343,  0.48585864],
       [-0.34067721,  0.37785758]])

In [6]:
X_scaled = torch.Tensor(X_scaled)
y = torch.LongTensor(y)
y

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

## Creating a Model

We set a seed to have reproducible results

In [7]:
np.random.seed(1337)

We create a classifier equivalent to a logistic regression. With PyTorch, the crossentropy loss computes the softmax of the outputs. We do not add an activation in the last layer.

In [8]:
class Model(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 3)
        
    def forward(self, x):
        x = self.fc1(x)
        return x

Or with one hidden layer

In [9]:
class Model2(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 10)
        self.fc2 = nn.Linear(10, 3)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

To try the network with one hidden layer, set `complex` to true

In [10]:
complex = True

In [11]:
input_dim = X_scaled.shape[1]
if not complex:
    model = Model(input_dim)
else:
    model = Model2(input_dim)
loss_fn = nn.CrossEntropyLoss()    # cross entropy loss
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

## Fitting the Model

In [12]:
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(X_scaled, y)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

We fit the model

In [13]:
for epoch in range(100):
    loss_train = 0
    for X_scaled_batch, y_batch in dataloader:
        y_batch_pred = model(X_scaled_batch)
        loss = loss_fn(y_batch_pred, y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        loss_train += loss.item()
    if epoch % 10 == 0:
        print(loss_train/len(y))
print(loss_train/len(y))

1.1144217756059436
0.6301314734750324
0.48248221740747493
0.3854936653592934
0.3299453552424287
0.29637347696762945
0.27467437639651404
0.2598221229130609
0.24856676284624782
0.24027480838380547
0.23393825554101366


### The weights

In [14]:
model.state_dict()

OrderedDict([('fc1.weight',
              tensor([[ 0.9028, -0.8251],
                      [ 0.2049, -0.3287],
                      [ 0.6561, -0.5856],
                      [-0.5527, -0.5252],
                      [-0.4681, -0.5736],
                      [ 0.0182, -0.8083],
                      [-0.5353, -0.1462],
                      [-1.7000,  1.7545],
                      [ 0.5046, -0.7802],
                      [ 0.7776, -0.9957]])),
             ('fc1.bias',
              tensor([-0.3950, -0.7090, -0.3700, -0.2123, -0.2808, -0.4309, -0.5964, -0.7815,
                       0.0769,  1.6403])),
             ('fc2.weight',
              tensor([[-0.2916,  0.0715, -0.2203, -0.2371,  0.1895, -0.3670,  0.0598, -0.7319,
                       -0.5844,  0.7279],
                      [-0.0359, -0.0161, -0.4614,  0.3110,  0.1629,  0.0484, -0.2127,  2.0011,
                       -0.0665, -1.4676],
                      [ 0.8433, -0.0402,  0.2970, -0.1923, -0.1104,  0.1559, -0.2938

## Prediction
### Probabilities

We compute the probabilities to belong to the classes for all the training set

In [15]:
model.eval()

Model2(
  (fc1): Linear(in_features=2, out_features=10, bias=True)
  (fc2): Linear(in_features=10, out_features=3, bias=True)
)

The output with no activation

In [16]:
Y_pred_logits = model(X_scaled)
Y_pred_logits[:4]

tensor([[ 2.3403, -2.8232,  1.1642],
        [ 1.3628, -0.4866, -0.4976],
        [ 1.1867, -0.0478, -0.8030],
        [ 1.6225, -1.1341, -0.0469]], grad_fn=<SliceBackward0>)

The probabilities

In [17]:
Y_pred_proba = F.softmax(model(X_scaled))
Y_pred_proba[:4]

  Y_pred_proba = F.softmax(model(X_scaled))


tensor([[0.7609, 0.0044, 0.2347],
        [0.7616, 0.1198, 0.1185],
        [0.7004, 0.2038, 0.0958],
        [0.7988, 0.0507, 0.1505]], grad_fn=<SliceBackward0>)

We recompute it with matrices

In [18]:
m_params = list(model.parameters())

In [19]:
if complex:
    print(torch.softmax(torch.relu(X_scaled @ m_params[0].T + m_params[1]) @ m_params[2].T + m_params[3], dim=-1)[:4])
else:
    print(torch.softmax(X_scaled @ m_params[0].T + m_params[1], dim=-1)[:4])

tensor([[0.7609, 0.0044, 0.2347],
        [0.7616, 0.1198, 0.1185],
        [0.7004, 0.2038, 0.0958],
        [0.7988, 0.0507, 0.1505]], grad_fn=<SliceBackward0>)


### Classes

In [20]:
y_pred = torch.argmax(Y_pred_proba, dim=-1)
y_pred

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 0, 2, 2])

## Loss
We recompute the loss

For one observation

In [21]:
loss_fn(Y_pred_logits[0], y[0])

tensor(0.2732, grad_fn=<NllLossBackward0>)

In [22]:
-torch.log(Y_pred_proba[0])[y[0]]

tensor(0.2732, grad_fn=<NegBackward0>)

For the dataset

In [23]:
loss_fn(Y_pred_logits, y)

tensor(0.2308, grad_fn=<NllLossBackward0>)

In [24]:
-torch.mean(torch.log(Y_pred_proba[range(0, len(y)), y]))

tensor(0.2308, grad_fn=<NegBackward0>)

## Evaluation

With sklearn

In [25]:
from sklearn.metrics import classification_report

print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.83      1.00      0.91        15
           1       1.00      1.00      1.00        15
           2       1.00      0.80      0.89        15

    accuracy                           0.93        45
   macro avg       0.94      0.93      0.93        45
weighted avg       0.94      0.93      0.93        45



We computed the accuracy from the training set. This is not a good practice. We should use a dedicated test set instead.