# Training a Classifier on the *Salammb√¥* Dataset with PyTorch
Author: Pierre Nugues

We first need to import some modules

In [1]:
import torch
import torch.nn as nn
import numpy as np

### Reading the dataset
We can read the data from a file with the svmlight format or directly create numpy arrays

In [2]:
y_train = np.array(
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

X_train = np.array(
    [[35680, 2217], [42514, 2761], [15162, 990], [35298, 2274],
     [29800, 1865], [40255, 2606], [74532, 4805], [37464, 2396],
     [31030, 1993], [24843, 1627], [36172, 2375], [39552, 2560],
     [72545, 4597], [75352, 4871], [18031, 1119], [36961, 2503],
     [43621, 2992], [15694, 1042], [36231, 2487], [29945, 2014],
     [40588, 2805], [75255, 5062], [37709, 2643], [30899, 2126],
     [25486, 1784], [37497, 2641], [40398, 2766], [74105, 5047],
     [76725, 5312], [18317, 1215]
     ], dtype=np.float32)

In [3]:
y_train = y_train.reshape((-1, 1))
y_train

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])

## Scaling the Data
Scaling and normalizing are usually very significant with neural networks. We use sklean transformers. They consist of two main methods: `fit()` and `transform()`.

### Normalizing

In [4]:
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
normalizer.fit(X_train)
X_train_norm = normalizer.transform(X_train)
X_train_norm

array([[0.9980751 , 0.06201605],
       [0.99789774, 0.06480679],
       [0.9978751 , 0.06515607],
       [0.9979313 , 0.06428964],
       [0.99804735, 0.06246169],
       [0.9979111 , 0.06460207],
       [0.9979283 , 0.06433539],
       [0.99796116, 0.06382433],
       [0.99794376, 0.06409609],
       [0.9978623 , 0.06535129],
       [0.9978515 , 0.06551746],
       [0.9979119 , 0.06458977],
       [0.99799836, 0.06324073],
       [0.9979172 , 0.06450863],
       [0.99807984, 0.06194062],
       [0.9977148 , 0.06756528],
       [0.9976559 , 0.06843004],
       [0.99780315, 0.06624894],
       [0.99765235, 0.06848173],
       [0.99774593, 0.06710504],
       [0.9976205 , 0.06894466],
       [0.9977454 , 0.06711297],
       [0.9975528 , 0.06991784],
       [0.9976413 , 0.06864253],
       [0.997559  , 0.06982835],
       [0.99752885, 0.07025825],
       [0.9976642 , 0.06830881],
       [0.99768883, 0.06794866],
       [0.99761194, 0.06906894],
       [0.9978073 , 0.06618638]], dtype=flo

### Standardizing

In [5]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(X_train_norm)
X_train_scaled = scaler.transform(X_train_norm)
X_train_scaled

array([[ 1.6833179 , -1.7197781 ],
       [ 0.5732904 , -0.56145555],
       [ 0.43155304, -0.4164824 ],
       [ 0.78328556, -0.7761012 ],
       [ 1.5095031 , -1.5348104 ],
       [ 0.65684086, -0.64642626],
       [ 0.76463586, -0.75711364],
       [ 0.97015506, -0.96923274],
       [ 0.8612411 , -0.8564363 ],
       [ 0.35135952, -0.3354576 ],
       [ 0.28384775, -0.26648712],
       [ 0.66168976, -0.6515319 ],
       [ 1.2029028 , -1.2114629 ],
       [ 0.69488615, -0.68520844],
       [ 1.7127844 , -1.7510858 ],
       [-0.57142544,  0.5834798 ],
       [-0.93994266,  0.9424059 ],
       [-0.01864966,  0.03712154],
       [-0.96232224,  0.96386117],
       [-0.37672305,  0.3924542 ],
       [-1.1615006 ,  1.1560031 ],
       [-0.38007998,  0.39574763],
       [-1.5852207 ,  1.5599334 ],
       [-1.0313259 ,  1.030602  ],
       [-1.5464294 ,  1.5227871 ],
       [-1.7351639 ,  1.7012235 ],
       [-0.88809663,  0.89208895],
       [-0.73405045,  0.74260706],
       [-1.2152115 ,

## Fitting the Data

We set a seed to have reproducible results

In [6]:
np.random.seed(1337)
torch.manual_seed(1234)

<torch._C.Generator at 0x1204374f0>

We define a classifier equivalent to a logistic regression

In [7]:
class Model(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 1)

    def forward(self, x):
        x = torch.sigmoid(self.layer1(x))
        return x

We create PyTorch tensors

In [8]:
X_train_scaled = torch.from_numpy(X_train_scaled).float()
y_train = torch.from_numpy(y_train).float()

We create the model

In [9]:
input_dim = X_train_scaled.shape[1]
model = Model(input_dim)
loss_fn = nn.BCELoss()  # binary cross entropy loss
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

We fit the model with a batch size of one item

In [10]:
model.train()
for epoch in range(50):
    for x_train_scaled, y_t in zip(X_train_scaled, y_train):
        y_train_pred = model(X_train_scaled)
        loss = loss_fn(y_train_pred, y_train)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

### The weights

In [11]:
list(model.parameters())

[Parameter containing:
 tensor([[-2.0757,  1.2755]], requires_grad=True),
 Parameter containing:
 tensor([0.0259], requires_grad=True)]

## We evaluate the model

We compute the probabilities to belong to class 1 for all the training set

In [12]:
model.eval()
predicted_probs = model(X_train_scaled)
predicted_probs

tensor([[0.0035],
        [0.1324],
        [0.1976],
        [0.0698],
        [0.0063],
        [0.1032],
        [0.0740],
        [0.0383],
        [0.0545],
        [0.2439],
        [0.2884],
        [0.1017],
        [0.0177],
        [0.0919],
        [0.0031],
        [0.8761],
        [0.9600],
        [0.5280],
        [0.9628],
        [0.7873],
        [0.9804],
        [0.7891],
        [0.9951],
        [0.9702],
        [0.9944],
        [0.9970],
        [0.9529],
        [0.9239],
        [0.9835],
        [0.5062]], grad_fn=<SigmoidBackward0>)

In [13]:
from sklearn.metrics import accuracy_score

In [14]:
def predict_class(preds):
    c = []
    for x in range(len(preds)):
        if (preds[x] >= 0.5):
            c += [1]
        else:
            c += [0]
    return np.array(c)

In [15]:
classes = predict_class(predicted_probs)
classes

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1])

In [16]:
accuracy_score(y_train, classes)

1.0

We computed the accuracy from the training set. This is not a good practice. We should use a dedicated test set instead.