# Training a Classifier on the *Salammbô* Dataset with PyTorch
Author: Pierre Nugues

We first need to import some modules

In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np

### Reading the dataset
We can read the data from a file with the svmlight format or directly create numpy arrays

In [2]:
# Class labels for the 30 samples: the first 15 are class 0, the last 15 are class 1.
y_train = np.array(
    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Feature matrix: one row per sample, two raw counts per row, stored as float32.
# Row i of X_train pairs with label i of y_train.
# NOTE(review): the two columns are presumably (total characters, occurrences of
# one letter) per text -- confirm against the dataset description.
X_train = np.array(
    [[35680, 2217], [42514, 2761], [15162, 990], [35298, 2274],
     [29800, 1865], [40255, 2606], [74532, 4805], [37464, 2396],
     [31030, 1993], [24843, 1627], [36172, 2375], [39552, 2560],
     [72545, 4597], [75352, 4871], [18031, 1119], [36961, 2503],
     [43621, 2992], [15694, 1042], [36231, 2487], [29945, 2014],
     [40588, 2805], [75255, 5062], [37709, 2643], [30899, 2126],
     [25486, 1784], [37497, 2641], [40398, 2766], [74105, 5047],
     [76725, 5312], [18317, 1215]
     ],dtype=np.float32)

In [3]:
# Turn the flat label vector into a single-column array (one row per sample);
# -1 lets NumPy infer the number of rows.
y_train = y_train.reshape((-1, 1))
y_train

array([[0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])

## Scaling the Data
Scaling and normalizing the inputs usually matter a great deal with neural networks. We use scikit-learn transformers, which expose two main methods: `fit()` and `transform()`.

### Normalizing

In [4]:
from sklearn.preprocessing import Normalizer
# Normalizer rescales every row (sample) independently to unit norm
# (L2 by default in scikit-learn), so each output row has length 1.
normalizer = Normalizer()
normalizer.fit(X_train)
X_train_norm = normalizer.transform(X_train)
X_train_norm

array([[0.9980751 , 0.06201605],
       [0.99789774, 0.06480679],
       [0.9978751 , 0.06515607],
       [0.9979313 , 0.06428964],
       [0.99804735, 0.06246169],
       [0.9979111 , 0.06460207],
       [0.9979283 , 0.06433539],
       [0.99796116, 0.06382433],
       [0.99794376, 0.06409609],
       [0.9978623 , 0.06535129],
       [0.9978515 , 0.06551746],
       [0.9979119 , 0.06458977],
       [0.99799836, 0.06324073],
       [0.9979172 , 0.06450863],
       [0.99807984, 0.06194062],
       [0.9977148 , 0.06756528],
       [0.9976559 , 0.06843004],
       [0.99780315, 0.06624894],
       [0.99765235, 0.06848173],
       [0.99774593, 0.06710504],
       [0.9976205 , 0.06894466],
       [0.9977454 , 0.06711297],
       [0.9975528 , 0.06991784],
       [0.9976413 , 0.06864253],
       [0.997559  , 0.06982835],
       [0.99752885, 0.07025825],
       [0.9976642 , 0.06830881],
       [0.99768883, 0.06794866],
       [0.99761194, 0.06906894],
       [0.9978073 , 0.06618638]], dtype=flo

### Standardizing

In [5]:
from sklearn.preprocessing import StandardScaler
# Column-wise standardization: subtract the mean (with_mean) and divide by the
# standard deviation (with_std), both learned from the normalized training data.
scaler = StandardScaler(with_mean=True,with_std=True)
scaler.fit(X_train_norm)
X_train_scaled = scaler.transform(X_train_norm)
X_train_scaled

array([[ 1.6832309 , -1.7197777 ],
       [ 0.5732034 , -0.56145513],
       [ 0.431466  , -0.41648194],
       [ 0.7831985 , -0.7761007 ],
       [ 1.5094161 , -1.5348101 ],
       [ 0.65675384, -0.64642584],
       [ 0.76454884, -0.7571132 ],
       [ 0.97006804, -0.9692323 ],
       [ 0.861154  , -0.85643595],
       [ 0.35127246, -0.3354572 ],
       [ 0.28376073, -0.2664867 ],
       [ 0.66160274, -0.6515314 ],
       [ 1.2028158 , -1.2114625 ],
       [ 0.6947991 , -0.685208  ],
       [ 1.7126973 , -1.7510854 ],
       [-0.57151246,  0.58348024],
       [-0.9400297 ,  0.9424063 ],
       [-0.01873669,  0.03712195],
       [-0.96240926,  0.9638616 ],
       [-0.37681007,  0.3924546 ],
       [-1.1615876 ,  1.1560036 ],
       [-0.380167  ,  0.39574805],
       [-1.5853077 ,  1.5599338 ],
       [-1.031413  ,  1.0306025 ],
       [-1.5465164 ,  1.5227875 ],
       [-1.735251  ,  1.7012239 ],
       [-0.88818365,  0.89208937],
       [-0.7341374 ,  0.7426075 ],
       [-1.2152987 ,

## Fitting the Data

We set a seed to have reproducible results

In [6]:
# nn.Linear draws its initial weights from PyTorch's own random generator, which
# np.random.seed() does not touch; seed it so that training is repeatable.
torch.manual_seed(1337)
# Seed NumPy as well for any NumPy-side randomness. Kept last so the cell
# displays nothing (torch.manual_seed returns a Generator object).
np.random.seed(1337)

We define a classifier equivalent to a logistic regression

In [7]:
class Model(nn.Module):
    """Single linear unit followed by a sigmoid (a logistic-regression classifier).

    Args:
        input_dim: number of features in each input row.
    """

    def __init__(self, input_dim):
        super().__init__()
        # One output unit: the pre-sigmoid score of each input row.
        self.layer1 = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output, one value per input row."""
        logits = self.layer1(x)
        return torch.sigmoid(logits)

We create PyTorch tensors

In [8]:
# Plain tensors take part in autograd directly; the Variable wrapper has been a
# deprecated no-op since PyTorch 0.4, so the arrays are converted without it.
# as_tensor converts to float32 and avoids a copy when the dtype already matches.
X_train_scaled = torch.as_tensor(X_train_scaled, dtype=torch.float32)
y_train = torch.as_tensor(y_train, dtype=torch.float32)

We create the model

In [9]:
# The number of input features is the column count of the training tensor.
input_dim = X_train_scaled.shape[1]
model = Model(input_dim)
loss_fn = nn.BCELoss()# binary cross-entropy; expects probabilities, which Model's sigmoid output provides
# Stochastic gradient descent over all model parameters, learning rate 0.01.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

We fit the model

In [10]:
# Full-batch training: each of the 1000 iterations uses the whole training set.
for epoch in range(1000):
    # Forward pass: model outputs for all training rows, then the loss against the labels.
    y_train_pred = model(X_train_scaled)
    loss = loss_fn(y_train_pred, y_train)
    # Clear gradients left over from the previous iteration before computing new ones.
    optimizer.zero_grad()
    loss.backward()
    # Apply one parameter update using the freshly computed gradients.
    optimizer.step()

### The weights

In [11]:
# The model's learnable parameters: layer1's weight matrix followed by its bias.
list(model.parameters())

[Parameter containing:
 tensor([[-1.7364,  1.0800]], requires_grad=True),
 Parameter containing:
 tensor([0.1754], requires_grad=True)]

## We evaluate the model

We compute the probability of belonging to class 1 for every sample in the training set

In [12]:
# Evaluation only: no_grad() skips building the autograd graph, so the result
# is a plain tensor without a grad_fn.
with torch.no_grad():
    predicted_probs = model(X_train_scaled)
predicted_probs

tensor([[0.0099],
        [0.1937],
        [0.2643],
        [0.1168],
        [0.0163],
        [0.1593],
        [0.1224],
        [0.0720],
        [0.0958],
        [0.3107],
        [0.3532],
        [0.1575],
        [0.0384],
        [0.1454],
        [0.0091],
        [0.8579],
        [0.9440],
        [0.5617],
        [0.9472],
        [0.7779],
        [0.9690],
        [0.7795],
        [0.9902],
        [0.9560],
        [0.9891],
        [0.9935],
        [0.9359],
        [0.9048],
        [0.9731],
        [0.5435]], grad_fn=<SigmoidBackward>)

In [13]:
from sklearn.metrics import accuracy_score

In [14]:
def predict_class(preds, threshold=0.5):
    """Convert predicted probabilities into hard 0/1 class labels.

    Args:
        preds: one probability per sample, given as a torch tensor, a NumPy
            array or a sequence. Either flat ``(n,)`` or a single column ``(n, 1)``.
        threshold: a sample is labelled 1 when its probability is greater than
            or equal to this value. Defaults to 0.5.

    Returns:
        A 1-D integer NumPy array holding one label per sample.

    Raises:
        ValueError: if ``preds`` holds more than one value per sample.
    """
    if isinstance(preds, torch.Tensor):
        # detach() drops any autograd history so the values can be read as NumPy.
        probs = preds.detach().cpu().numpy()
    else:
        probs = np.asarray(preds)
    flat = probs.reshape(-1)
    if flat.shape[0] != len(probs):
        raise ValueError("predict_class expects exactly one probability per sample")
    # One vectorized comparison instead of a Python-level loop over the samples.
    return (flat >= threshold).astype(int)

In [15]:
# Hard 0/1 labels for the training samples, derived from predicted_probs.
classes = predict_class(predicted_probs)
classes

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1])

In [16]:
# Fraction of training samples whose predicted label equals the true label.
accuracy_score(y_train, classes)

1.0

We computed the accuracy from the training set. This is not a good practice. We should use a dedicated test set instead.