# Training a Classifier on the *Salammb√¥* Dataset with PyTorch
Author: Pierre Nugues

We first need to import some modules

In [1]:
import torch
import torch.nn as nn
import numpy as np

### Reading the dataset
We can read the data from a file with the svmlight format or directly create numpy arrays

In [2]:
X = np.array(
    [[35680, 2217], [42514, 2761], [15162, 990], [35298, 2274],
     [29800, 1865], [40255, 2606], [74532, 4805], [37464, 2396],
     [31030, 1993], [24843, 1627], [36172, 2375], [39552, 2560],
     [72545, 4597], [75352, 4871], [18031, 1119], [36961, 2503],
     [43621, 2992], [15694, 1042], [36231, 2487], [29945, 2014],
     [40588, 2805], [75255, 5062], [37709, 2643], [30899, 2126],
     [25486, 1784], [37497, 2641], [40398, 2766], [74105, 5047],
     [76725, 5312], [18317, 1215]
     ])

y = np.array(
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

## Scaling the Data
Scaling and normalizing are usually very significant with neural networks. We use sklean transformers. They consist of two main methods: `fit()` and `transform()`.

### Normalizing

In [3]:
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
X_norm = normalizer.fit_transform(X)
X_norm[:4]

array([[0.99807515, 0.06201605],
       [0.99789783, 0.06480679],
       [0.99787509, 0.06515607],
       [0.99793128, 0.06428964]])

### Standardizing

In [4]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=True, with_std=True)
X_scaled = scaler.fit_transform(X_norm)
X_scaled[:4]

array([[ 1.68336574, -1.7197772 ],
       [ 0.57376529, -0.56145427],
       [ 0.43143908, -0.41648279],
       [ 0.78308579, -0.77610221]])

## Creating PyTorch Tensors
PyTorch has its own implementation of matrices called tensors. They are more or less equivalent to NumPy arrays. We need to convert our dataset to these tensors and represent $\mathbf{y}$  as a column vector

In [5]:
Y = y.reshape((-1, 1))
Y[:4]

array([[0],
       [0],
       [0],
       [0]])

In [6]:
X_scaled = torch.Tensor(X_scaled)
X_scaled

tensor([[ 1.6834, -1.7198],
        [ 0.5738, -0.5615],
        [ 0.4314, -0.4165],
        [ 0.7831, -0.7761],
        [ 1.5095, -1.5348],
        [ 0.6568, -0.6464],
        [ 0.7646, -0.7571],
        [ 0.9700, -0.9692],
        [ 0.8610, -0.8564],
        [ 0.3516, -0.3355],
        [ 0.2834, -0.2665],
        [ 0.6618, -0.6515],
        [ 1.2025, -1.2115],
        [ 0.6947, -0.6852],
        [ 1.7127, -1.7511],
        [-0.5712,  0.5835],
        [-0.9400,  0.9424],
        [-0.0189,  0.0371],
        [-0.9622,  0.9639],
        [-0.3768,  0.3925],
        [-1.1617,  1.1560],
        [-0.3802,  0.3957],
        [-1.5856,  1.5599],
        [-1.0314,  1.0306],
        [-1.5463,  1.5228],
        [-1.7352,  1.7012],
        [-0.8880,  0.8921],
        [-0.7341,  0.7426],
        [-1.2155,  1.2076],
        [ 0.0071,  0.0112]])

In [7]:
Y = torch.Tensor(Y)
Y[:4]

tensor([[0.],
        [0.],
        [0.],
        [0.]])

## Creating a Model

We set a seed to have reproducible results

In [8]:
np.random.seed(1337)

We define a classifier equivalent to a logistic regression

In [9]:
class Model(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 1)
        
    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))
        return x

And a model with one hidden layer

In [10]:
class Model2(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 10)
        self.fc2 = nn.Linear(10, 1)
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

We create the model. To try the network with one hidden layer, set `complex` to true

In [11]:
complex = True

In [12]:
input_dim = X_scaled.shape[1]
if not complex:
    model = Model(input_dim)
else:
    model = Model2(input_dim)
loss_fn = nn.BCELoss()    # binary cross entropy loss
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

## Fitting the Model
We will show three methods: batch gradient descent, stochastic descent, and minibatches

### Batch Gradient Descent

We fit the whole dataset (batch gradient descent)

In [13]:
model.train()               # sets PyTorch in the train mode
for epoch in range(100):
    Y_pred = model(X_scaled)
    loss = loss_fn(Y_pred, Y)
    optimizer.zero_grad()   # resets the gradients
    loss.backward()         # gradient backpropagation
    optimizer.step()        # weight updates

### Stochastic Gradient Descent

or, we fit the model with a batch size of one item (stochastic gradient descent)

In [14]:
model.train()
for epoch in range(50):
    for x_scaled, y in zip(X_scaled, Y):
        y_pred = model(x_scaled)
        loss = loss_fn(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

### Minibatch Gradient Descent

Or we fit it with mini-batches, first with a simple inner loop

In [15]:
batch_size = 4
model.train()
for epoch in range(50):
    # Would need to shuffle X and y
    for i in range(0, X_scaled.size()[0], batch_size):
        Y_batch_pred = model(X_scaled[i:i + batch_size])
        loss = loss_fn(Y_batch_pred, Y[i:i + batch_size])
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Then with a dataloader

In [16]:
from torch.utils.data import TensorDataset, DataLoader

dataset = TensorDataset(X_scaled, Y)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

In [17]:
model.train()
for epoch in range(50):
    for X_scaled_batch, Y_batch in dataloader:
        Y_batch_pred = model(X_scaled_batch)
        loss = loss_fn(Y_batch_pred, Y_batch)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

## The weights

In [18]:
list(model.parameters())

[Parameter containing:
 tensor([[-0.2581,  0.2969],
         [-0.4730, -0.4871],
         [ 0.8854, -0.1830],
         [ 0.0878, -0.7802],
         [-0.1503, -0.0709],
         [-1.0377,  1.0611],
         [ 1.2919, -0.7912],
         [-0.8118, -0.1599],
         [ 0.3107, -0.3579],
         [ 0.1399,  0.8829]], requires_grad=True),
 Parameter containing:
 tensor([-0.6654, -0.4556,  0.1112,  0.4948, -0.2618,  1.1851,  0.6857,  0.2688,
          0.4569,  0.1631], requires_grad=True),
 Parameter containing:
 tensor([[-0.0141,  0.2367, -0.5857, -0.4630,  0.1345,  1.6129, -1.4366,  0.4888,
          -0.3997,  0.5151]], requires_grad=True),
 Parameter containing:
 tensor([0.3099], requires_grad=True)]

Also in the form of a dictionary

In [19]:
model.state_dict()

OrderedDict([('fc1.weight',
              tensor([[-0.2581,  0.2969],
                      [-0.4730, -0.4871],
                      [ 0.8854, -0.1830],
                      [ 0.0878, -0.7802],
                      [-0.1503, -0.0709],
                      [-1.0377,  1.0611],
                      [ 1.2919, -0.7912],
                      [-0.8118, -0.1599],
                      [ 0.3107, -0.3579],
                      [ 0.1399,  0.8829]])),
             ('fc1.bias',
              tensor([-0.6654, -0.4556,  0.1112,  0.4948, -0.2618,  1.1851,  0.6857,  0.2688,
                       0.4569,  0.1631])),
             ('fc2.weight',
              tensor([[-0.0141,  0.2367, -0.5857, -0.4630,  0.1345,  1.6129, -1.4366,  0.4888,
                       -0.3997,  0.5151]])),
             ('fc2.bias', tensor([0.3099]))])

## Prediction

### Probabilities

We compute the probabilities to belong to class 1 for all the training set

In [20]:
model.eval()
y_pred_proba = model(X_scaled)
y_pred_proba[:4]

tensor([[0.0002],
        [0.0268],
        [0.0759],
        [0.0110]], grad_fn=<SliceBackward0>)

We recompute it with matrices

In [21]:
m_params = list(model.parameters())

In [22]:
if complex:
    print(torch.sigmoid(torch.relu(X_scaled @ m_params[0].T + m_params[1]) @ m_params[2].T + m_params[3])[:4])
else:
    print(torch.sigmoid(X_scaled @ m_params[0].T + m_params[1])[:4])

tensor([[0.0002],
        [0.0268],
        [0.0759],
        [0.0110]], grad_fn=<SliceBackward0>)


### Classes

In [23]:
def predict_class(y_pred_proba):
    y_pred = np.zeros(y_pred_proba.shape[0])
    for i in range(y_pred_proba.shape[0]):
        if y_pred_proba[i][0] >= 0.5:
            y_pred[i] = 1
    return y_pred

In [24]:
y_pred = predict_class(y_pred_proba)
y_pred

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

## Evaluation

In [25]:
from sklearn.metrics import classification_report

print(classification_report(Y, y_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        15
         1.0       1.00      1.00      1.00        15

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



We computed the accuracy from the training set. This is not a good practice. We should use a dedicated test set instead.