In [221]:
import math

import torch
from torch import FloatTensor, LongTensor, Tensor
from tqdm import tqdm, tqdm_notebook

In [50]:
class Module(object):
    """Common interface shared by every layer, container and loss.

    Subclasses override `forward` and `backward`. Modules that own trainable
    tensors also override `param` and `zero_grad`; the defaults here describe
    a module without any parameters.
    """

    def forward(self, *input_):
        """Compute the module output; must be overridden by subclasses."""
        raise NotImplementedError

    def backward(self, *gradwrtoutput):
        """Propagate the gradient backwards; must be overridden by subclasses."""
        raise NotImplementedError

    def param(self):
        """Return the module's parameters; empty for parameter-free modules."""
        return []

    def zero_grad(self):
        """Reset accumulated gradients; nothing to do without parameters.

        Defined on the base class so that containers can call `zero_grad()`
        on every child, including children that do not define it themselves.
        """
        pass

## ReLU Module

ReLU function:
\begin{equation}
f(x) = \max(0, x)
\end{equation}

The derivative of ReLU (taken to be 0 at $x = 0$ by convention) is

\begin{equation} 
f'(x)=
    \begin{cases}
      1, & \text{if}\ x>0 \\
      0, & \text{otherwise}
    \end{cases}
\end{equation}

In [350]:
class ReLU(Module):
    """Element-wise rectified linear unit, f(x) = max(0, x)."""

    def __init__(self):
        # Snapshot of the input seen by the most recent forward pass.
        self.z = None

    def forward(self, input_):
        """Return max(0, input_) as a new tensor.

        `input_` itself is left unmodified, so the caller's tensor (for
        example a data batch fed directly into this layer) keeps its values.
        """
        # Keep a private copy so later in-place changes made by the caller
        # cannot alter the derivative computed in backward().
        self.z = input_.clone()
        return input_.clamp(min=0)

    def backward(self, gradwrtoutput):
        """Return the gradient w.r.t. the input given the gradient w.r.t. the output.

        Must be called after `forward`, which records the input.
        """
        # g'(z): 1 where z > 0, 0 elsewhere
        g_prime = (self.z > 0).type_as(gradwrtoutput)
        # dz[l] = da[l] * g'(z[l])
        return gradwrtoutput.mul(g_prime)

    def param(self):
        """ReLU has no trainable parameters."""
        return []

    def zero_grad(self):
        """Nothing to reset: no gradients are accumulated here."""
        pass

## Tanh Module

In [340]:
class Tanh(Module):
    """Element-wise hyperbolic tangent activation."""

    def __init__(self):
        # Input of the most recent forward pass (stored by reference).
        self.z = None

    def forward(self, input_):
        """Remember the input and return tanh(input_)."""
        self.z = input_
        activated = self.z.tanh()
        return activated

    def backward(self, gradwrtoutput):
        """Return the gradient w.r.t. the input: da * (1 - tanh(z)^2)."""
        squashed = self.z.tanh()
        # g'(z) = 1 - tanh(z)^2
        local_slope = 1 - squashed.pow(2)
        return gradwrtoutput.mul(local_slope)

    def param(self):
        """Tanh has no trainable parameters."""
        return []

    def zero_grad(self):
        """Nothing to reset: no gradients are accumulated here."""
        pass

## Linear Module
A fully connected (affine) layer computing $y = Wx + b$.

In [502]:
class Linear(Module):
    """Fully connected layer: every input row x is mapped to w x + b.

    `w` has size (out_dim, in_dim) and `b` has size (out_dim, 1); both are
    drawn from a normal distribution with mean 0. Gradients are accumulated
    into `grad_w_sum` / `grad_b_sum` until `zero_grad` is called.
    """

    def __init__(self, in_dim, out_dim):
        # Weights are drawn before biases.
        self.w = Tensor(out_dim, in_dim).normal_(0)
        self.b = Tensor(out_dim, 1).normal_(0)
        # Input of the most recent forward pass, needed to compute dw.
        self.x_previous_layer = None
        # Running sums of the gradients w.r.t. w and b.
        self.grad_w_sum = Tensor(out_dim, in_dim).zero_()
        self.grad_b_sum = Tensor(out_dim, 1).zero_()

    def forward(self, input_):
        """Return the affine map of `input_`, one output row per input row."""
        self.x_previous_layer = input_
        affine = self.w.mm(input_.t())
        affine = affine + self.b
        return affine.t()

    def backward(self, gradwrtoutput):
        """Accumulate dw and db, then return the gradient w.r.t. the input."""
        grad_out_t = gradwrtoutput.t()
        # dw = dz . x ; db = dz summed over the batch
        self.grad_w_sum += grad_out_t.mm(self.x_previous_layer)
        self.grad_b_sum += grad_out_t.sum(1).unsqueeze(1)
        grad_in_t = self.w.t().mm(grad_out_t)
        return grad_in_t.t()

    def param(self):
        """Return (parameter, accumulated gradient) pairs for w and b."""
        weight_pair = (self.w, self.grad_w_sum)
        bias_pair = (self.b, self.grad_b_sum)
        return [weight_pair, bias_pair]

    def zero_grad(self):
        """Reset both gradient accumulators to zero in place."""
        for accumulator in (self.grad_w_sum, self.grad_b_sum):
            accumulator.zero_()

## Sequential Module
Combines several modules into a basic sequential structure.

In [327]:
class Sequential(Module):
    """Container that chains modules: each output feeds the next module."""

    def __init__(self, *layers_):
        # Kept as the tuple of positional arguments, in call order.
        self.modules = layers_

    def forward(self, input_):
        """Run `input_` through every module, first to last."""
        activation = input_
        for layer in self.modules:
            activation = layer.forward(activation)
        return activation

    def backward(self, gradwrtoutput):
        """Propagate the gradient through every module, last to first."""
        grad = gradwrtoutput
        for layer in self.modules[::-1]:
            grad = layer.backward(grad)
        return grad

    def param(self):
        """Return the concatenated parameter lists of all modules, in order."""
        collected = []
        for layer in self.modules:
            collected.extend(layer.param())
        return collected

    def zero_grad(self):
        """Ask every module to reset its accumulated gradients."""
        for layer in self.modules:
            layer.zero_grad()

## MSE Loss Function

In [101]:
class LossMSE(Module):
    """Squared-error loss.

    Note that `forward` returns the *sum* of squared differences over all
    elements; no averaging is applied.
    """

    def __init__(self):
        # Difference preds - labels from the most recent forward pass.
        self.error = None

    def forward(self, preds, labels):
        """Store preds - labels and return the sum of its squared entries."""
        residual = preds - labels
        self.error = residual
        return residual.pow(2).sum()

    def backward(self):
        """Return the gradient of the loss w.r.t. preds: 2 * (preds - labels)."""
        return self.error * 2

    def param(self):
        """The loss has no trainable parameters."""
        return []

## SGD optimization

In [465]:
class optim_SGD(Module):
    """Plain gradient-descent update over (parameter, gradient) pairs."""

    def __init__(self, parameters, learning_rate):
        self.lr = learning_rate
        # NOTE(review): this instance attribute shadows the inherited
        # Module.param() method, so `optimizer.param()` is not callable.
        # Kept as-is because the attribute name is part of the current
        # interface; consider renaming it.
        self.param = parameters

    def step(self):
        """Update every parameter in place: p <- p - lr * grad."""
        for weight, grad in self.param:
            weight.sub_(self.lr * grad)

### Testing

In [494]:
# Random regression target for the smoke test: 5 rows x 4 outputs, drawn from N(0, 5^2).
target = FloatTensor(5, 4).normal_(mean=0, std=5)

In [497]:
# Smoke test: a single forward/backward pass through a small ReLU network.
our_model = Sequential(Linear(2,10), ReLU(), Linear(10,4), ReLU())
our_loss = LossMSE()
our_optim = optim_SGD(our_model.param(), 0.01)

# `a` is not defined by any cell of this notebook, so this cell failed with a
# NameError on a fresh kernel. It must hold 5 rows of 2 features to match
# Linear(2, 10) and the 5x4 `target`.
# NOTE(review): the distribution below is an assumption - confirm the intended input.
a = FloatTensor(5, 2).normal_(0, 5)

output = our_model.forward(a)
our_loss.forward(output, target)

# Displayed as the cell output: gradient of the loss w.r.t. the input `a`.
# No optimizer step is applied here; the cell only inspects the gradient.
our_model.backward(our_loss.backward())


   1.8532   10.2142
-141.7424  153.1749
 -64.6264   69.8389
   0.0000    0.0000
-164.5195   86.1878
[torch.FloatTensor of size 5x2]

## Test file

In [534]:
def generate_disc_set(nb):
    """Sample `nb` points uniformly in [0, 1]^2 and label them by a disc test.

    A point gets label 1 when its squared distance to the origin is strictly
    below 2/pi, and label 0 otherwise.

    Returns:
        (points, labels): an (nb, 2) tensor of coordinates and a tensor of
        `nb` integer labels.
    """
    points = Tensor(nb, 2).uniform_(0, 1)
    squared_norm = points.pow(2).sum(1)
    inside_disc = squared_norm < (2 / math.pi)
    return points, inside_disc.long()

# Seed the global torch RNG so that the sampled data sets (and the layer
# weights initialised further down) are reproducible on Restart & Run All.
torch.manual_seed(0)

# Two independent draws of 1000 labelled points each.
train_input, train_target = generate_disc_set(1000)
test_input, test_target = generate_disc_set(1000)

In [535]:
# Number of training samples with label 1 (quick class-balance check).
train_target.sum()

520

In [526]:
def convert_to_one_hot(target, nb_classes=2):
    """One-hot encode a 1-D tensor of integer class labels.

    Args:
        target: tensor of class indices, each in [0, nb_classes).
        nb_classes: number of columns of the encoding (defaults to 2).

    Returns:
        A FloatTensor of size (target.size(0), nb_classes) whose row k holds
        a 1 in column target[k] and 0 elsewhere.
    """
    one_hot = FloatTensor(target.size(0), nb_classes).zero_()
    # Encode the labels passed in as `target` (not any global tensor), all at
    # once: write a 1 at (k, target[k]) for every row k.
    column_index = target.long().contiguous().view(-1, 1)
    one_hot.scatter_(1, column_index, 1.0)
    return one_hot

In [536]:
# One-hot encode the integer class labels of both splits.
train_one_hot_target = convert_to_one_hot(train_target)
test_one_hot_target = convert_to_one_hot(test_target)

In [556]:
# Three fully connected layers with tanh activations, trained with the
# squared-error loss and plain SGD on mini-batches.
model = Sequential(Linear(2,25), Tanh(), Linear(25,25), Tanh(), Linear(25,2), Tanh())

criterion = LossMSE()
optimizer = optim_SGD(model.param(), 1e-3)
nb_epochs = 100
mini_batch_size = 5

for e in range(0, nb_epochs):
    loss = 0
    for b in range(0, train_input.size(0), mini_batch_size):
        # The last batch may be shorter when the data set size is not a
        # multiple of mini_batch_size; narrow() would raise otherwise.
        batch_len = min(mini_batch_size, train_input.size(0) - b)
        output = model.forward(train_input.narrow(0, b, batch_len))
        # float(...) keeps the running loss a plain Python number whether the
        # criterion returns a float or a 0-dim tensor.
        loss += float(criterion.forward(output, train_one_hot_target.narrow(0, b, batch_len)))
        # Linear layers accumulate gradients, so clear them before each backward pass.
        model.zero_grad()
        model.backward(criterion.backward())
        optimizer.step()
    print("Epoch", e+1, ":", loss)

Epoch 1 : 1232.1206424680252
Epoch 2 : 395.00490848482826
Epoch 3 : 284.54823530106984
Epoch 4 : 247.88485170867702
Epoch 5 : 214.811980666586
Epoch 6 : 176.07135696391555
Epoch 7 : 139.64829893080648
Epoch 8 : 125.56326954292145
Epoch 9 : 118.1748435563116
Epoch 10 : 112.99839875409816
Epoch 11 : 108.92823118393753
Epoch 12 : 105.49558437747082
Epoch 13 : 102.4633725938744
Epoch 14 : 99.70230613186138
Epoch 15 : 97.13978662415683
Epoch 16 : 94.73573195138749
Epoch 17 : 92.46955646511796
Epoch 18 : 90.33204806339903
Epoch 19 : 88.31973954808043
Epoch 20 : 86.43102904268
Epoch 21 : 84.6639106038726
Epoch 22 : 83.01499246640608
Epoch 23 : 81.47941062448085
Epoch 24 : 80.05120698120467
Epoch 25 : 78.72373668487606
Epoch 26 : 77.49004642496656
Epoch 27 : 76.34335888649491
Epoch 28 : 75.27715828406097
Epoch 29 : 74.285409241054
Epoch 30 : 73.3625733547268
Epoch 31 : 72.50355743472016
Epoch 32 : 71.70369856836193
Epoch 33 : 70.95859975262792
Epoch 34 : 70.26412210405859
Epoch 35 : 69.6163322

In [557]:
# Evaluate on the held-out set: the predicted class is the index of the
# largest of the two network outputs.
output = model.forward(test_input)
predicted_classes = output.max(1)[1]

# Count mismatches in one vectorised comparison instead of a Python loop, and
# avoid assigning to `_`, which IPython uses for the last cell output.
nb_data_errors = int((predicted_classes != test_target).long().sum())

In [558]:
# Accuracy in percent: share of test samples whose predicted class equals the label.
print("Test accuracy :", 100*(1-nb_data_errors/test_input.size(0)), "%")

Test accuracy : 97.7 %
