# Training a Classifier on the *Salammbô* Dataset with PyTorch
Author: Pierre Nugues

We first need to import some modules

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np

### Reading the dataset
We can read the data from a file with the svmlight format or directly create numpy arrays

In [None]:
# Class labels: the first 15 items belong to class 0, the last 15 to class 1.
y_train = np.repeat([0, 1], 15)

# One row per item and two numeric features per row, stored as float32.
# Rows are in the same order as the labels above.
X_train = np.array(
    [
        [35680, 2217],
        [42514, 2761],
        [15162, 990],
        [35298, 2274],
        [29800, 1865],
        [40255, 2606],
        [74532, 4805],
        [37464, 2396],
        [31030, 1993],
        [24843, 1627],
        [36172, 2375],
        [39552, 2560],
        [72545, 4597],
        [75352, 4871],
        [18031, 1119],
        [36961, 2503],
        [43621, 2992],
        [15694, 1042],
        [36231, 2487],
        [29945, 2014],
        [40588, 2805],
        [75255, 5062],
        [37709, 2643],
        [30899, 2126],
        [25486, 1784],
        [37497, 2641],
        [40398, 2766],
        [74105, 5047],
        [76725, 5312],
        [18317, 1215],
    ],
    dtype=np.float32,
)

In [None]:
# Turn the label vector into a single-column 2-D array (one row per item);
# the row count is inferred from the data.
y_train = np.reshape(y_train, (-1, 1))
y_train

## Scaling the Data
Scaling and normalizing the inputs usually matters a great deal with neural networks. We use sklearn transformers, which expose two main methods: `fit()` and `transform()`.

### Normalizing

In [None]:
from sklearn.preprocessing import Normalizer

# Fit and apply the normalizer in a single call; the fitted transformer
# remains available as `normalizer`.
normalizer = Normalizer()
X_train_norm = normalizer.fit_transform(X_train)
X_train_norm

### Standardizing

In [None]:
from sklearn.preprocessing import StandardScaler

# Both centering (with_mean) and scaling (with_std) are requested explicitly.
# The scaler is fitted on the normalized data and applied to it in one call;
# the fitted transformer remains available as `scaler`.
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_scaled = scaler.fit_transform(X_train_norm)
X_train_scaled

## Fitting the Data

We set a seed to have reproducible results

In [None]:
# Seed both NumPy and PyTorch. The initial weights of nn.Linear come from
# PyTorch's own random number generator, which np.random.seed() does not
# affect, so seeding NumPy alone does not make the training reproducible.
np.random.seed(1337)
torch.manual_seed(1337)

We define a classifier equivalent to a logistic regression

In [None]:
class Model(nn.Module):
    """Single linear unit followed by a sigmoid, i.e. a logistic regression."""

    def __init__(self, input_dim):
        """Create the linear layer mapping ``input_dim`` features to one output."""
        super().__init__()
        self.layer1 = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return the sigmoid of the linear layer's output for ``x``."""
        logits = self.layer1(x)
        return torch.sigmoid(logits)

We create PyTorch tensors

In [None]:
# Convert the NumPy arrays to float32 tensors. The torch.autograd.Variable
# wrapper is deprecated (plain tensors support autograd directly), so the
# tensors are built without it. torch.as_tensor also accepts an existing
# tensor, which keeps this cell safe to re-run.
X_train_scaled = torch.as_tensor(X_train_scaled, dtype=torch.float32)
y_train = torch.as_tensor(y_train, dtype=torch.float32)

We create the model

In [None]:
# The number of input features is the size of the last axis of the
# training matrix.
input_dim = X_train_scaled.shape[-1]
model = Model(input_dim)

# Binary cross-entropy loss, minimized with plain stochastic gradient descent.
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.01)

We fit the model with a batch size of one item

In [None]:
# Stochastic gradient descent with a batch size of one: every update is
# computed from a single (features, label) pair, not from the whole
# training set.
model.train()
for epoch in range(50):
    for x_train_scaled, y_t in zip(X_train_scaled, y_train):
        # Clear the gradients accumulated by the previous step.
        optimizer.zero_grad()
        # Forward pass on the current item only. The output has shape (1,),
        # which matches the shape of its label y_t.
        y_train_pred = model(x_train_scaled)
        loss = loss_fn(y_train_pred, y_t)
        loss.backward()
        optimizer.step()

### The weights

In [None]:
# Materialize the parameter iterator as a list so the notebook displays the
# model's parameters.
list(model.parameters())

## We evaluate the model

We compute, for every item of the training set, the probability that it belongs to class 1

In [None]:
# Switch the model to evaluation mode and disable gradient tracking:
# inference does not need the autograd graph.
model.eval()
with torch.no_grad():
    predicted_probs = model(X_train_scaled)
predicted_probs

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
def predict_class(preds):
    """Threshold predicted probabilities at 0.5.

    Returns a NumPy array holding 1 for each entry of ``preds`` that is
    greater than or equal to 0.5, and 0 for every other entry.
    """
    return np.array([1 if prob >= 0.5 else 0 for prob in preds])

In [None]:
# Turn the predicted probabilities into hard 0/1 class labels.
classes = predict_class(preds=predicted_probs)
classes

In [None]:
# Compare the predicted classes with the training labels.
accuracy_score(y_true=y_train, y_pred=classes)

We computed the accuracy from the training set. This is not a good practice. We should use a dedicated test set instead.