In [1]:
from torch import FloatTensor, LongTensor, Tensor
import math

## Module Superclass

In [2]:
class Module(object):
    """Common interface shared by layers, activations, losses and containers.

    Subclasses override ``forward`` and ``backward``; modules that own
    trainable tensors additionally override ``param`` and ``zero_grad``.
    """

    def forward(self, *input_):
        """Compute the module's output from its input(s). Must be overridden."""
        raise NotImplementedError

    def backward(self, *gradwrtoutput):
        """Propagate the gradient w.r.t. the output back to the input. Must be overridden."""
        raise NotImplementedError

    def param(self):
        """Return a list of (parameter, gradient) pairs; empty for parameter-free modules."""
        return []

    def zero_grad(self):
        """Reset accumulated gradients.

        The default is a no-op, matching the empty list returned by ``param``.
        Containers call ``zero_grad`` on every child, so the base class must
        provide it for subclasses that have nothing to reset.
        """
        pass

## ReLU Module

ReLU function: 
\begin{equation}
f(x) = \max(0, x)
\end{equation}

The derivative of ReLU is:

\begin{equation} 
f'(x)=
    \begin{cases}
      1, & \text{if}\ x>0 \\
      0, & \text{otherwise}
    \end{cases}
\end{equation}

In [3]:
# This module represents the ReLU activation function
class ReLU(Module):
    """Rectified linear unit: f(x) = max(0, x), applied element-wise."""

    def __init__(self):
        # copy of the last input seen by forward(); backward() needs its sign
        self.z = None

    def forward(self, input_):
        """Return max(0, input_) as a new tensor.

        input_: the tensor output by the preceding layer.

        The input is left untouched: the previous implementation zeroed the
        negative entries in place, which silently modified the caller's tensor
        (e.g. the training data when ReLU is the first module).
        """
        self.z = input_.clone()
        return input_.clamp(min=0)

    def backward(self, gradwrtoutput):
        """Return the gradient w.r.t. the input given the gradient w.r.t. the output.

        The local derivative is 1 where the stored input was strictly positive
        and 0 elsewhere.
        """
        # g'(z) as a 0/1 mask with the same type as the incoming gradient
        g_prime = (self.z > 0).type_as(gradwrtoutput)
        # dz[l]
        return gradwrtoutput.mul(g_prime)

    def param(self):
        """ReLU has no trainable parameters."""
        return []

    def zero_grad(self):
        """Nothing to reset: ReLU holds no gradients."""
        pass

## Tanh Module

In [4]:
class Tanh(Module):
    """Hyperbolic tangent activation, applied element-wise."""

    def __init__(self):
        # last input passed to forward(); read again by backward()
        self.z = None

    def forward(self, input_):
        """Remember the input (the tensor output by the preceding layer) and return tanh of it."""
        self.z = input_
        return input_.tanh()

    def backward(self, gradwrtoutput):
        """Scale the incoming gradient by tanh'(z) = 1 - tanh(z)^2."""
        activation = self.z.tanh()
        local_slope = 1 - activation.pow(2)
        return gradwrtoutput.mul(local_slope)

    def param(self):
        """No trainable parameters."""
        return []

    def zero_grad(self):
        """No gradients to reset."""
        pass

## Linear Module
A fully connected layer: each sample $x$ is mapped to $Wx + b$.

In [5]:
class Linear(Module):
    """Fully connected layer mapping (batch, in_dim) inputs to (batch, out_dim) outputs."""

    def __init__(self, in_dim, out_dim):
        # input of the most recent forward pass, needed to compute dl/dw
        self.x_previous_layer = None
        # weights (out_dim x in_dim) and biases (out_dim x 1), drawn from a normal(0, 1)
        self.w = Tensor(out_dim, in_dim).normal_()
        self.b = Tensor(out_dim, 1).normal_()
        # running sums of the loss gradients w.r.t. w and b
        self.grad_w_sum = Tensor(self.w.size()).zero_()
        self.grad_b_sum = Tensor(self.b.size()).zero_()

    def forward(self, input_):
        """Store the input (output of the previous activation) and return the affine map.

        The product is formed column-wise (one column per sample) so the bias
        column broadcasts over the batch; the result is transposed back to one
        row per sample.
        """
        self.x_previous_layer = input_
        column_output = self.w.mm(input_.t()) + self.b
        return column_output.t()

    def backward(self, gradwrtoutput):
        """Accumulate the parameter gradients and return the gradient w.r.t. the input."""
        grad_columns = gradwrtoutput.t()
        # add this batch's contribution to the accumulated gradients
        self.grad_w_sum += grad_columns.mm(self.x_previous_layer)
        self.grad_b_sum += grad_columns.sum(1).unsqueeze(1)
        grad_input_columns = self.w.t().mm(grad_columns)
        return grad_input_columns.t()

    def param(self):
        """Return [(w, dl/dw), (b, dl/db)]: each parameter tensor paired with its gradient."""
        weight_pair = (self.w, self.grad_w_sum)
        bias_pair = (self.b, self.grad_b_sum)
        return [weight_pair, bias_pair]

    def zero_grad(self):
        """Reset both accumulated gradients to zero in place."""
        for accumulated in (self.grad_w_sum, self.grad_b_sum):
            accumulated.zero_()

## Sequential Module
Combines several modules into a basic sequential structure.

In [6]:
# This module allows to combine several modules (layers, activation functions) in a basic sequential structure
class Sequential(Module):
    """Container that chains modules, feeding each one's output to the next."""

    def __init__(self, *layers_):
        # kept in the order given; forward walks it front to back, backward in reverse
        self.modules = layers_

    def forward(self, input_):
        """Run a minibatch (one sample per row, one feature per column) through every module."""
        activation = input_
        for layer in self.modules:
            activation = layer.forward(activation)
        return activation

    def backward(self, gradwrtoutput):
        """Propagate the gradient through the modules from last to first."""
        grad = gradwrtoutput
        for layer in reversed(self.modules):
            grad = layer.backward(grad)
        return grad

    def param(self):
        """Return the flattened list of every module's (parameter, gradient) tuples."""
        collected = []
        for layer in self.modules:
            collected.extend(layer.param())
        return collected

    def zero_grad(self):
        """Ask every module to reset its accumulated gradients."""
        for layer in self.modules:
            layer.zero_grad()

## MSE Loss Function

In [7]:
class LossMSE(Module):
    """Squared-error loss, summed (not averaged) over all entries."""

    def __init__(self):
        # preds - labels from the latest forward() call
        self.error = None

    def forward(self, preds, labels):
        """Store the residual and return the sum of its squared entries."""
        residual = preds - labels
        self.error = residual
        return residual.pow(2).sum()

    def backward(self):
        """Return the gradient of the loss w.r.t. the predictions: 2 * (preds - labels)."""
        return self.error * 2

    def param(self):
        """The loss has no trainable parameters."""
        return []

## Cross-entropy Loss Function

In [8]:
class LossCrossEntropy(Module):
    """Softmax followed by cross-entropy, summed over the batch."""

    def __init__(self):
        # softmax probabilities and labels from the latest forward() call
        self.p = None
        self.c = None

    def _shift(self, t):
        """Subtract each row's maximum so the largest exponent is exp(0) = 1."""
        return t - t.max(1)[0].unsqueeze(1)

    def softmax(self, t):
        """Stabilized softmax: convert each row of predictions to probabilities.

        Shifting by the row maximum avoids NaNs caused by large exponentials.
        """
        stable_exp = self._shift(t).exp()
        return stable_exp / (stable_exp.sum(1).unsqueeze(1))

    def _log_softmax(self, t):
        """Log of the softmax, computed without taking the log of a probability.

        Taking ``softmax(t).log()`` yields -inf whenever a probability
        underflows to 0, and ``0 * -inf`` then turns the whole loss into NaN.
        Computing ``shifted - log(sum(exp(shifted)))`` keeps every entry finite.
        """
        shifted = self._shift(t)
        return shifted - shifted.exp().sum(1).log().unsqueeze(1)

    def forward(self, preds, labels):
        """Return -sum(labels * log_softmax(preds)); labels have the same shape as preds.

        The probabilities and labels are stored for ``backward``.
        """
        self.p = self.softmax(preds)
        self.c = labels
        return -(self.c * self._log_softmax(preds)).sum()

    def backward(self):
        """Return the gradient of the loss w.r.t. the raw predictions: p - c."""
        return self.p - self.c

    def param(self):
        """The loss has no trainable parameters."""
        return []

## SGD optimization

In [9]:
class optim_SGD(Module):
    """Plain stochastic gradient descent over a list of (parameter, gradient) pairs."""

    def __init__(self, parameters, learning_rate):
        """parameters: the pairs returned by the Sequential module's param()."""
        self.lr = learning_rate
        # NOTE: this instance attribute shadows the param() method inherited from Module
        self.param = parameters

    def step(self):
        """Perform one gradient step: update every parameter in place by -lr * gradient."""
        for weights, grad in self.param:
            update = self.lr * grad
            weights.sub_(update)

## Helpers

In [10]:
def generate_disc_set(nb):
    """Sample nb points uniformly in [0, 1)^2 and label them by disc membership.

    A point gets label 1 when its distance to (0.5, 0.5) is below
    sqrt(1 / (2*pi)), i.e. when it lies in a disc of area 1/2; otherwise 0.
    Returns the (nb, 2) points and the matching long tensor of labels.
    """
    points = Tensor(nb, 2).uniform_(0, 1)
    radius = math.sqrt(1/(2*math.pi))
    distance_to_center = (points - 0.5).pow(2).sum(1).sqrt()
    inside_disc = distance_to_center < radius
    return points, inside_disc.long()

In [11]:
# converts 'target' Tensor to one hot labels
def convert_to_one_hot(target, nb_classes=2):
    """Return a float tensor of one-hot rows for a tensor of integer class indices.

    target: integer class indices, one per sample.
    nb_classes: number of columns in the result; defaults to 2, the value
        that was previously hard-coded.

    Row k of the result is all zeros except for a 1 in column target[k].
    """
    # one index per row, as the (n, 1) long tensor scatter_ expects
    class_index = target.contiguous().view(-1, 1).long()
    one_hot = FloatTensor(class_index.size(0), nb_classes).zero_()
    # write every 1 in a single vectorised call instead of a per-sample loop
    return one_hot.scatter_(1, class_index, 1.0)

In [12]:
def compute_accuracy(model, input_, target):
    """Return the percentage of samples whose predicted class equals the target.

    The predicted class of a sample is the index of the largest entry in its
    row of model.forward(input_); target holds the expected class indices.
    """
    nb_samples = input_.size(0)
    predicted_classes = model.forward(input_).max(1)[1]
    # count the samples whose prediction disagrees with the target
    nb_data_errors = sum(
        1 for k in range(nb_samples) if target[k] != predicted_classes[k]
    )
    return 100 - (100*(nb_data_errors / nb_samples))

In [13]:
def train_model(train_input, train_one_hot_target, model, criterion, optimizer, nb_epochs=100, mini_batch_size=5, verbose=False):
    """Train model with minibatch gradient steps.

    train_input: one sample per row.
    train_one_hot_target: targets aligned row-by-row with train_input.
    model: provides forward(), zero_grad() and backward().
    criterion: provides forward(output, target) and backward().
    optimizer: provides step().
    nb_epochs: number of passes over the training set.
    mini_batch_size: number of samples per batch; the last batch of an epoch
        is smaller when the sample count is not a multiple of this value.
    verbose: when True, print the summed loss after each epoch.
    """
    nb_samples = train_input.size(0)
    for e in range(0, nb_epochs):
        loss = 0
        for b in range(0, nb_samples, mini_batch_size):
            # clamp the final batch: narrow() raises if it would run past the last row
            batch_size = min(mini_batch_size, nb_samples - b)
            batch_input = train_input.narrow(0, b, batch_size)
            batch_target = train_one_hot_target.narrow(0, b, batch_size)
            output = model.forward(batch_input)
            # sum the loss for each batch to get the current epoch's loss
            loss += criterion.forward(output, batch_target)
            # clear the accumulated gradients so backward() only holds this batch's contribution
            model.zero_grad()
            model.backward(criterion.backward())
            optimizer.step() # performs a gradient step to optimize the parameters
        if verbose:
            print("Epoch", e+1, ":", loss)

## Test file

In [14]:
# Generate training and testing data: two separate draws of 1000 points each,
# every draw returning the points together with their 0/1 labels.
train_input, train_target = generate_disc_set(1000)
test_input, test_target = generate_disc_set(1000)

# Convert targets to one-hot labels. The training loop feeds the one-hot
# training targets to the loss; the integer targets are kept for compute_accuracy.
train_one_hot_target = convert_to_one_hot(train_target)
test_one_hot_target = convert_to_one_hot(test_target)

In [15]:
# Number of training samples labelled 1 (inside the disc); a quick check of class balance.
train_target.sum()

506

In [16]:
# Training settings; these two names are reused by the repeated-runs cell further down.
nb_epochs = 100
mini_batch_size = 5
# 2 inputs -> three hidden layers of 25 units with tanh -> 2 output scores (one per class)
model = Sequential(Linear(2,25), Tanh(), Linear(25,25), Tanh(), Linear(25, 25), Tanh(), Linear(25,2))
# Alternative loss, kept for quick switching:
#criterion = LossMSE()
criterion = LossCrossEntropy()
# The optimizer receives the model's (parameter, gradient) pairs and a learning rate of 1e-3.
optimizer = optim_SGD(model.param(), 1e-3)

# verbose=True prints the summed loss after every epoch
train_model(train_input, train_one_hot_target, model, criterion, optimizer, nb_epochs, mini_batch_size, verbose=True)

# Accuracy is measured against the integer targets, not the one-hot ones.
print("Train accuracy :", compute_accuracy(model, train_input, train_target), "%")
print("Test accuracy :", compute_accuracy(model, test_input, test_target), "%")

Epoch 1 : 758.0199315973441
Epoch 2 : 296.6092055734043
Epoch 3 : 197.67283817746466
Epoch 4 : 167.65694955298204
Epoch 5 : 151.76528120001421
Epoch 6 : 141.9750021950149
Epoch 7 : 134.8398970694543
Epoch 8 : 129.22230606994526
Epoch 9 : 124.60645500844294
Epoch 10 : 120.69821669754856
Epoch 11 : 117.31153733971951
Epoch 12 : 114.31890583731274
Epoch 13 : 111.62717726784194
Epoch 14 : 109.16543072008972
Epoch 15 : 106.88049628996032
Epoch 16 : 104.72981941302533
Epoch 17 : 102.67470442627018
Epoch 18 : 100.68407852192877
Epoch 19 : 98.74729880184759
Epoch 20 : 96.8753701215528
Epoch 21 : 95.08625340387152
Epoch 22 : 93.39407917066046
Epoch 23 : 91.80450126241408
Epoch 24 : 90.3168570939431
Epoch 25 : 88.92560970754334
Epoch 26 : 87.62321039209928
Epoch 27 : 86.4016387773047
Epoch 28 : 85.2526659193395
Epoch 29 : 84.16924408279283
Epoch 30 : 83.1448877273167
Epoch 31 : 82.17393467319357
Epoch 32 : 81.25141862140447
Epoch 33 : 80.37291809653073
Epoch 34 : 79.53478716464531
Epoch 35 : 78.

In [18]:
# Repeat the whole experiment n_iters times, each with a freshly initialised model
# trained with the MSE loss on the same data, and collect the accuracy of every run.
# Relies on nb_epochs and mini_batch_size defined in the earlier training cell.
n_iters = 50
test_acc=[]
train_acc=[]
for i in range(n_iters):
    print('iter', i+1)
    # new model, loss and optimizer for every run so that runs do not share weights
    model = Sequential(Linear(2,25), Tanh(), Linear(25,25), Tanh(), Linear(25, 25), Tanh(), Linear(25,2))
    criterion = LossMSE()
    optimizer = optim_SGD(model.param(), 1e-3)
    
    train_model(train_input, train_one_hot_target, model, criterion, optimizer, nb_epochs, mini_batch_size)

    train_a = compute_accuracy(model, train_input, train_target)
    test_a = compute_accuracy(model, test_input, test_target)
    print("Train accuracy :", train_a, "%")
    print("Test accuracy :", test_a, "%")
    # keep the per-run accuracies for the averages computed in the next cell
    train_acc.append(train_a)
    test_acc.append(test_a)

iter 1
Train accuracy : 97.6 %
Test accuracy : 95.6 %
iter 2
Train accuracy : 96.1 %
Test accuracy : 94.6 %
iter 3
Train accuracy : 97.8 %
Test accuracy : 95.3 %
iter 4
Train accuracy : 98.0 %
Test accuracy : 96.5 %
iter 5
Train accuracy : 97.7 %
Test accuracy : 96.9 %
iter 6
Train accuracy : 96.6 %
Test accuracy : 96.1 %
iter 7
Train accuracy : 97.6 %
Test accuracy : 96.8 %
iter 8
Train accuracy : 98.0 %
Test accuracy : 96.8 %
iter 9
Train accuracy : 97.0 %
Test accuracy : 95.6 %
iter 10
Train accuracy : 97.1 %
Test accuracy : 96.8 %
iter 11
Train accuracy : 97.7 %
Test accuracy : 96.1 %
iter 12
Train accuracy : 97.9 %
Test accuracy : 97.1 %
iter 13
Train accuracy : 97.9 %
Test accuracy : 96.8 %
iter 14
Train accuracy : 96.3 %
Test accuracy : 95.2 %
iter 15
Train accuracy : 96.8 %
Test accuracy : 95.0 %
iter 16
Train accuracy : 97.8 %
Test accuracy : 96.8 %
iter 17
Train accuracy : 96.6 %
Test accuracy : 95.8 %
iter 18
Train accuracy : 95.6 %
Test accuracy : 94.5 %
iter 19
Train accur

In [19]:
# Mean train / test accuracy (in percent) over all the runs collected above.
print('train accuracy', sum(train_acc)/len(train_acc))
print('test accuracy', sum(test_acc)/len(test_acc))

train accuracy 97.27
test accuracy 96.15799999999999
