In [5]:
from torch import FloatTensor, LongTensor, Tensor
import math

## Module Superclass

In [6]:
class Module(object):
    """Common interface for every building block of the framework.

    Sub-classes override `forward` and `backward`; blocks that own trainable
    tensors also override `param` and `zero_grad`.
    """

    def forward(self, *input_):
        """Compute the block's output from its input(s). Must be overridden."""
        raise NotImplementedError

    def backward(self, *gradwrtoutput):
        """Propagate the gradient of the loss back through the block. Must be overridden."""
        raise NotImplementedError

    def param(self):
        """Return a list of (parameter, gradient) pairs; empty for parameter-free blocks."""
        return []

    def zero_grad(self):
        """Reset accumulated gradients; a no-op for parameter-free blocks.

        Sequential.zero_grad calls this on every contained module, so the base
        class provides a safe default instead of forcing each sub-class to
        define an empty method.
        """
        pass

## ReLU Module

The ReLU function is
\begin{equation}
f(x) = \max(0, x)
\end{equation}

The derivative of ReLU (using the convention $f'(0) = 0$) is

\begin{equation} 
f'(x)=
    \begin{cases}
      1, & \text{if}\ x>0 \\
      0, & \text{otherwise}
    \end{cases}
\end{equation}

In [7]:
# This module represents the ReLU activation function
class ReLU(Module):
    """Rectified linear unit, f(x) = max(0, x), applied element-wise."""

    def __init__(self):
        # input seen by the last forward() call, needed to evaluate f'(z) in backward()
        self.z = None

    # input_: the tensor output by the current layer
    def forward(self, input_):
        """Return max(0, input_) as a new tensor, leaving `input_` untouched."""
        # keep a private copy so later changes to the caller's tensor cannot alter the gradient
        self.z = input_.clone()
        # clamp() allocates its result; a masked assignment on input_ would
        # silently overwrite the caller's data (e.g. the training set when
        # ReLU is the first module of a network)
        return input_.clamp(min=0)

    def backward(self, gradwrtoutput):
        """Return the gradient w.r.t. the input given the gradient w.r.t. the output."""
        # g'(z): 1 where z > 0, 0 everywhere else (including z == 0)
        g_prime = (self.z > 0).type_as(gradwrtoutput)
        # dz[l] = da[l] * g'(z[l])
        return gradwrtoutput.mul(g_prime)

    def param(self):
        """ReLU has no trainable parameters."""
        return []

    def zero_grad(self):
        """Nothing to reset: ReLU accumulates no gradients."""
        pass

## Tanh Module

In [8]:
class Tanh(Module):
    """Hyperbolic tangent activation, applied element-wise."""

    def __init__(self):
        # input of the most recent forward() call (stored by reference, not copied)
        self.z = None

    # input_: the tensor output by the current layer
    def forward(self, input_):
        """Remember the input and return tanh(input_)."""
        self.z = input_
        return self.z.tanh()

    def backward(self, gradwrtoutput):
        """Return gradwrtoutput * tanh'(z), with tanh'(z) = 1 - tanh(z)^2."""
        activation = self.z.tanh()
        local_grad = 1 - activation.pow(2)
        return gradwrtoutput.mul(local_grad)

    def param(self):
        """Tanh has no trainable parameters."""
        return list()

    def zero_grad(self):
        """Nothing to reset: Tanh accumulates no gradients."""
        return None

## Linear Module
A fully connected (affine) layer.

In [84]:
class Linear(Module):
    """Fully connected layer: maps a (batch, in_dim) input to a (batch, out_dim) output.

    Weights have shape (out_dim, in_dim) and biases (out_dim, 1); both are
    drawn from a standard normal distribution.
    """

    def __init__(self, in_dim, out_dim):
        # input of the last forward() call, needed to compute dl/dw in backward()
        self.x_previous_layer = None
        # parameters, initialised from N(0, 1) (weights first, then biases)
        self.w = Tensor(out_dim, in_dim).normal_()
        self.b = Tensor(out_dim, 1).normal_()
        # accumulators for dl/dw and dl/db, same shapes as the parameters
        self.grad_w_sum = self.w.clone().zero_()
        self.grad_b_sum = self.b.clone().zero_()

    # input_: the output of the previous layer's activation function
    def forward(self, input_):
        """Return the affine transform of `input_`, one sample per row."""
        self.x_previous_layer = input_
        # work in (out_dim, batch) layout so the (out_dim, 1) bias broadcasts over samples
        z_t = self.w.mm(input_.t())
        z_t = z_t + self.b
        return z_t.t()

    def backward(self, gradwrtoutput):
        """Accumulate dl/dw and dl/db, and return dl/d(input) with one sample per row."""
        grad_out_t = gradwrtoutput.t()
        # in-place accumulation: the tensors handed out by param() must stay the same objects
        self.grad_w_sum.add_(grad_out_t.mm(self.x_previous_layer))
        # the bias gradient is the output gradient summed over the samples of the batch
        self.grad_b_sum.add_(grad_out_t.sum(1).unsqueeze(1))
        return self.w.t().mm(grad_out_t).t()

    # parameters: weights and biases
    def param(self):
        """Return [(w, dl/dw), (b, dl/db)] as (parameter, gradient) tensor pairs."""
        weight_pair = (self.w, self.grad_w_sum)
        bias_pair = (self.b, self.grad_b_sum)
        return [weight_pair, bias_pair]

    def zero_grad(self):
        """Reset both gradient accumulators to zero, in place."""
        for accumulator in (self.grad_w_sum, self.grad_b_sum):
            accumulator.zero_()

## Sequential Module
Combines several modules in a basic sequential structure.

In [85]:
# This module allows to combine several modules (layers, activation functions) in a basic sequential structure
class Sequential(Module):
    """Chain of modules applied one after the other."""

    def __init__(self, *layers_):
        # tuple of modules, in the order they are applied by forward()
        self.modules = layers_

    # input_: the input data is a minibatch whose columns are features and rows are samples
    def forward(self, input_):
        """Feed `input_` through every module in order and return the final output."""
        activation = input_
        for layer in self.modules:
            activation = layer.forward(activation)
        return activation

    def backward(self, gradwrtoutput):
        """Propagate the gradient through the modules from last to first."""
        grad = gradwrtoutput
        for layer in self.modules[::-1]:
            grad = layer.backward(grad)
        return grad

    # each entry is a tuple holding the parameter tensor (e.g. w)
    # and its gradient tensor (e.g. dl/dw)
    def param(self):
        """Return the flattened list of every module's (parameter, gradient) pairs."""
        collected = []
        for layer in self.modules:
            collected.extend(layer.param())
        return collected

    def zero_grad(self):
        """Reset the accumulated gradients of every module."""
        for layer in self.modules:
            layer.zero_grad()

## MSE Loss Function

In [86]:
class LossMSE(Module):
    """Squared-error loss, summed (not averaged) over every element of the batch."""

    def __init__(self):
        # preds - labels from the last forward() call, reused by backward()
        self.error = None

    def forward(self, preds, labels):
        """Return the sum of squared differences between `preds` and `labels`."""
        residual = preds - labels
        self.error = residual
        return residual.pow(2).sum()

    def backward(self):
        """Return the gradient of the loss w.r.t. the predictions: 2 * (preds - labels)."""
        return self.error * 2

    def param(self):
        """The loss has no trainable parameters."""
        return list()

## Cross-entropy Loss Function

In [87]:
class LossCrossEntropy(Module):
    """Softmax followed by cross-entropy, summed over the mini-batch.

    `preds` are raw scores with one sample per row; `labels` has the same
    shape (presumably one-hot rows, as produced by convert_to_one_hot).
    """

    def __init__(self):
        # softmax probabilities and targets of the last forward() call, reused by backward()
        self.p = None
        self.c = None

    # stabilized version of softmax: convert each prediction to a probability
    # (the row maximum is subtracted first to avoid nans due to big exponentials)
    def softmax(self, t):
        """Return the row-wise softmax of `t`."""
        stable_exp = (t - t.max(1)[0].unsqueeze(1)).exp()
        return stable_exp / (stable_exp.sum(1).unsqueeze(1))

    def _log_softmax(self, t):
        """Return the row-wise log-softmax of `t` using the log-sum-exp trick."""
        shifted = t - t.max(1)[0].unsqueeze(1)
        # log(softmax) = shifted - log(sum(exp(shifted))); this stays finite even
        # when a probability underflows to exactly 0
        return shifted - shifted.exp().sum(1).unsqueeze(1).log()

    def forward(self, preds, labels):
        """Return -sum(labels * log(softmax(preds)))."""
        self.p = self.softmax(preds)
        self.c = labels
        # taking self.p.log() would produce -inf for underflowed probabilities,
        # and 0 * -inf is nan; the log-softmax is therefore computed directly
        return -(self.c * self._log_softmax(preds)).sum()

    def backward(self):
        """Return softmax(preds) - labels.

        This is the gradient of the loss w.r.t. `preds` when every row of
        `labels` sums to 1.
        """
        return self.p - self.c

    def param(self):
        """The loss has no trainable parameters."""
        return []

## SGD optimization

In [88]:
class optim_SGD(Module):
    """Plain stochastic gradient descent over a list of (parameter, gradient) pairs."""

    # parameters: the parameters of the Sequential module
    def __init__(self, parameters, learning_rate):
        self.lr = learning_rate
        # NOTE: this attribute replaces the param() method inherited from Module
        # on the instance; it holds the (parameter, gradient) pairs directly
        self.param = parameters

    def step(self):
        """Update every parameter in place: p <- p - lr * grad_p."""
        for weight, grad in self.param:
            weight.sub_(grad * self.lr)

## Helpers

In [89]:
def generate_disc_set(nb):
    """Sample `nb` points uniformly in [0, 1]^2 and label them by a disc.

    A point gets label 1 when it lies strictly inside the disc centred at
    (0.5, 0.5) with radius sqrt(1 / (2 * pi)) (a disc of area 1/2), else 0.

    Returns the (nb, 2) point tensor and the (nb,) long tensor of labels.
    """
    radius = math.sqrt(1/(2*math.pi))
    points = Tensor(nb, 2).uniform_(0, 1)
    # Euclidean distance of every point to the centre of the unit square
    dist_to_center = (points - 0.5).pow(2).sum(1).sqrt()
    labels = (dist_to_center < radius).long()
    return points, labels

In [90]:
def convert_to_one_hot(target, nb_classes=2):
    """Convert a tensor of class indices to one-hot float labels.

    target: integer class indices, one per sample, each in [0, nb_classes).
    nb_classes: number of columns of the result (defaults to 2, the number of
        classes of the disc data set).

    Returns a FloatTensor of shape (target.size(0), nb_classes) holding a
    single 1 per row, in the column given by the corresponding index.
    """
    one_hot = FloatTensor(target.size(0), nb_classes).zero_()
    # scatter_ writes every 1 in a single vectorized call instead of a Python loop over rows
    one_hot.scatter_(1, target.reshape(-1, 1).long(), 1.0)
    return one_hot

In [91]:
def compute_accuracy(model, input_, target):
    """Return the percentage of samples of `input_` that `model` classifies correctly.

    model: object whose forward(input_) returns one row of class scores per sample.
    input_: samples, one per row.
    target: the true class index of every sample.

    The predicted class is the index of the largest score in each row. The
    result is a Python float in [0, 100].
    """
    nb_samples = input_.size(0)
    output = model.forward(input_)

    _, predicted_classes = output.max(1)
    # count the mismatches in one vectorized comparison rather than a Python
    # loop over samples; reshape(-1) also accepts column-shaped targets
    nb_data_errors = int(predicted_classes.ne(target.reshape(-1)).sum())
    return 100 - (100*(nb_data_errors / nb_samples))

In [92]:
def train_model(train_input, train_one_hot_target, model, criterion, optimizer, nb_epochs=100, mini_batch_size=5, verbose=False):
    """Train `model` with mini-batch gradient descent.

    train_input: training samples, one per row.
    train_one_hot_target: targets, one row per sample, aligned with train_input.
    model: provides forward(x), backward(grad) and zero_grad().
    criterion: provides forward(output, target) returning a scalar loss and
        backward() returning the gradient of that loss w.r.t. the output.
    optimizer: provides step(), which applies one update to the parameters.
    nb_epochs: number of passes over the training set.
    mini_batch_size: number of samples per batch; the last batch of an epoch
        is smaller when the data set size is not a multiple of it.
    verbose: when True, print the summed loss of every epoch.
    """
    nb_samples = train_input.size(0)
    for e in range(0, nb_epochs):
        epoch_loss = 0.0
        for b in range(0, nb_samples, mini_batch_size):
            # clip the final batch: narrow() raises if asked for rows past the end
            batch_size = min(mini_batch_size, nb_samples - b)
            batch_input = train_input.narrow(0, b, batch_size)
            batch_target = train_one_hot_target.narrow(0, b, batch_size)

            output = model.forward(batch_input)
            # sum the loss for each batch to get the current epoch's loss
            # (float() keeps the running total a plain number, whether the
            # criterion returns a Python float or a 0-dim tensor)
            epoch_loss += float(criterion.forward(output, batch_target))
            # set the gradients of all layers to zero before the backward pass accumulates into them
            model.zero_grad()
            model.backward(criterion.backward())
            optimizer.step() # performs a gradient step to optimize the parameters
        if verbose:
            print("Epoch", e+1, ":", epoch_loss)

## Test file

In [93]:
# draw the training set first, then the test set (1000 labelled points each)
(train_input, train_target), (test_input, test_target) = (
    generate_disc_set(1000),
    generate_disc_set(1000),
)

# one-hot encode the class indices of both sets
train_one_hot_target, test_one_hot_target = (
    convert_to_one_hot(labels) for labels in (train_target, test_target)
)

In [94]:
# training hyper-parameters, also reused by the repeated-runs cell further down
nb_epochs, mini_batch_size = 100, 5

# 2 inputs -> three hidden layers of 25 units -> 2 outputs
model = Sequential(
    Linear(2, 25), Tanh(),
    Linear(25, 25), Tanh(),
    Linear(25, 25), ReLU(),
    Linear(25, 2),
)
# LossCrossEntropy() can be used here instead of the squared-error loss
criterion = LossMSE()
optimizer = optim_SGD(model.param(), 1e-3)

train_model(
    train_input, train_one_hot_target, model, criterion, optimizer,
    nb_epochs=nb_epochs, mini_batch_size=mini_batch_size, verbose=True,
)

print("Train accuracy :", compute_accuracy(model, train_input, train_target), "%")
print("Test accuracy :", compute_accuracy(model, test_input, test_target), "%")

Epoch 1 : 13257.73107166785
Epoch 2 : 187.43299336306399
Epoch 3 : 180.4349893641963
Epoch 4 : 175.26777482688135
Epoch 5 : 168.27365114174822
Epoch 6 : 157.74529368249625
Epoch 7 : 139.19655357947505
Epoch 8 : 114.70104860879302
Epoch 9 : 95.27230610372244
Epoch 10 : 79.87113861192488
Epoch 11 : 72.27826486864699
Epoch 12 : 67.95747287673457
Epoch 13 : 64.71159408829863
Epoch 14 : 62.06390336946616
Epoch 15 : 59.6508522550436
Epoch 16 : 57.50170749957235
Epoch 17 : 55.41991055531961
Epoch 18 : 53.45476692533729
Epoch 19 : 51.715203675003934
Epoch 20 : 50.124037997089886
Epoch 21 : 48.69843302650733
Epoch 22 : 47.214209328893254
Epoch 23 : 46.06037787757636
Epoch 24 : 45.07272486671839
Epoch 25 : 44.15307514667424
Epoch 26 : 43.3029090386791
Epoch 27 : 42.48654155035658
Epoch 28 : 41.8322036135676
Epoch 29 : 41.12969682220667
Epoch 30 : 40.48452206207522
Epoch 31 : 39.900702329054035
Epoch 32 : 39.339075614669675
Epoch 33 : 38.8405862264273
Epoch 34 : 38.3653055756281
Epoch 35 : 37.904

In [95]:
# train n_iters freshly initialised networks and record the accuracy of each run
n_iters = 10
train_acc, test_acc = [], []
for i in range(n_iters):
    print('iter', i+1)
    # same layer sizes as the single run above, but with Tanh after every hidden layer
    model = Sequential(
        Linear(2, 25), Tanh(),
        Linear(25, 25), Tanh(),
        Linear(25, 25), Tanh(),
        Linear(25, 2),
    )
    # LossCrossEntropy() can be used here instead of the squared-error loss
    criterion = LossMSE()
    optimizer = optim_SGD(model.param(), 1e-3)

    train_model(
        train_input, train_one_hot_target, model, criterion, optimizer,
        nb_epochs=nb_epochs, mini_batch_size=mini_batch_size,
    )

    train_a = compute_accuracy(model, train_input, train_target)
    test_a = compute_accuracy(model, test_input, test_target)
    print("Train accuracy :", train_a, "%")
    print("Test accuracy :", test_a, "%")
    train_acc.append(train_a)
    test_acc.append(test_a)

iter 1
Train accuracy : 98.3 %
Test accuracy : 99.0 %
iter 2
Train accuracy : 98.8 %
Test accuracy : 98.8 %
iter 3
Train accuracy : 98.0 %
Test accuracy : 98.5 %
iter 4
Train accuracy : 98.8 %
Test accuracy : 98.6 %
iter 5
Train accuracy : 96.9 %
Test accuracy : 97.3 %
iter 6
Train accuracy : 98.5 %
Test accuracy : 98.9 %
iter 7
Train accuracy : 97.8 %
Test accuracy : 98.1 %
iter 8
Train accuracy : 99.0 %
Test accuracy : 99.0 %
iter 9
Train accuracy : 98.6 %
Test accuracy : 99.2 %
iter 10
Train accuracy : 98.6 %
Test accuracy : 98.8 %


In [22]:
# mean accuracy (in %) over the runs collected in train_acc / test_acc by the loop above
print('train accuracy', sum(train_acc)/len(train_acc))
print('test accuracy', sum(test_acc)/len(test_acc))

train accuracy 99.014
test accuracy 98.94199999999995
