In [221]:
from torch import FloatTensor, LongTensor, Tensor
import math
from tqdm import tqdm, tqdm_notebook

In [50]:
class Module(object):
    
    def forward(self, *input_):
        raise NotImplementedError
        
    def backward(self, *gradwrtoutput):
        raise NotImplementedError
        
    def param(self):
        return []

### ReLU Module

ReLU function: 
\begin{equation}
f(x) = max(0, x)
\end{equation}

the derivative of ReLU is

\begin{equation} 
f'(x)=
    \begin{cases}
      1, & \text{if}\ x>0 \\
      0, & \text{otherwise}
    \end{cases}
\end{equation}

In [350]:
class ReLU(Module):
    def __init__(self):
        self.z = None
    
    def forward(self, input_):
        self.z = input_.clone()
        input_[input_ < 0] = 0
        return input_
        
    def backward(self, gradwrtoutput):
        da = gradwrtoutput
        tensor = self.z.clone()
        # g'(z)
        tensor[tensor > 0] = 1
        tensor[tensor < 0] = 0
        # dz[l]
        return da.mul(tensor)
        
    def param(self):
        return []
    
    def zero_grad(self):
        pass

### Tanh Module

In [340]:
class Tanh(Module):   
    def __init__(self):
        self.z = None
    
    def forward(self, input_):
        self.z = input_
        return input_.tanh()
        
    def backward(self, gradwrtoutput):
        da = gradwrtoutput
        # g'(z)
        g_prime = (1 - self.z.tanh().pow(2))
        # dz[l]
        return da.mul(g_prime)
        
    def param(self):
        return []
    
    def zero_grad(self):
        pass

### Linear Module
fully connected layer

In [356]:
class Linear(Module):   
    def __init__(self, in_dim, out_dim):        
        self.w = Tensor(out_dim,in_dim).normal_(0)
        self.b = Tensor(out_dim,1).normal_(0)
        self.x_previous_layer = None
        # sum the gradient wrt w / b for each batch in these variables
        self.grad_w_sum = Tensor(self.w.size()).zero_()
        self.grad_b_sum = Tensor(self.b.size()).zero_()
    
    def forward(self, input_):
        self.x_previous_layer = input_
        return (self.w.mm(input_.t()) + self.b).t()
        
    def backward(self, gradwrtoutput):
        dz = gradwrtoutput.t()
        dw = dz.mm(self.x_previous_layer)
        db = dz
        # sum the gradients for the weights and biases
        self.grad_w_sum = dw
        self.grad_b_sum = db.sum(1)
        return (self.w.t().mm(dz)).t()
        
    def param(self):
        return [ (self.w, self.grad_w_sum), (self.b, self.grad_b_sum) ]
    
    def zero_grad(self):
        self.grad_w_sum.zero_()
        self.grad_b_sum.zero_()

### Sequential Module
to combine several modules in basic sequential structure

In [327]:
class Sequential(Module):    
    def __init__(self, *layers_):
        self.modules = layers_
        
    def forward(self, input_):
        x = input_
        for module in self.modules:
            x = module.forward(x)
        return x
        
    def backward(self, gradwrtoutput):
        x = gradwrtoutput
        for module in reversed(self.modules):
            x = module.backward(x)
        return x
        
    def param(self):
        return [ p for module in self.modules for p in module.param() ]
    
    def zero_grad(self):
        for module in self.modules:
            module.zero_grad()

### MSE Loss Function

In [101]:
class LossMSE(Module): 
    def __init__(self):
        self.error = None
        
    def forward(self, preds, labels):
        self.error = preds - labels
        return self.error.pow(2).sum()
        
    def backward(self):
        return 2 * self.error
        
    def param(self):
        return []

In [353]:
class optim_SGD(Module): 
    def __init__(self, parameters, learning_rate):
        self.param = parameters
        self.lr = learning_rate
        
    def step(self):
        #print(self.param)
        for (p, grad_p) in self.param:
            p -= self.lr*grad_p
        #print(self.param)

### testing

In [267]:
a = FloatTensor(5, 2)
a.normal_(0, 5)
b = FloatTensor(5, 2)
b.normal_(0, 5)


  5.9373   7.2770
 -7.0436   2.5153
  7.2012   3.9569
 18.7668  -2.0585
  3.5095   5.1504
[torch.FloatTensor of size 5x2]

In [73]:
print(a)
print(b)


 -9.4639 -11.7929
 -3.3858 -14.1017
  3.7349   1.5512
  4.7306  -0.6805
  1.5711  -0.4694
[torch.FloatTensor of size 5x2]


-3.9147  5.9485
-4.1990 -1.6456
 8.7344 -7.4869
 2.4498 -8.1163
-0.4472  5.3993
[torch.FloatTensor of size 5x2]



In [291]:
a.sum(1)


  2.4001
 -7.2299
  5.4902
  6.9134
 10.6199
[torch.FloatTensor of size 5]

In [318]:
a.pow(2).sum()

260.3018182516098

In [324]:
target = FloatTensor(5, 4).normal_(0,5)

In [355]:
our_model = Sequential(Linear(2,10), ReLU(), Linear(10,4), ReLU())

our_loss = LossMSE()

our_optim = optim_SGD(our_model.param(), 0.01)

output = our_model.forward(a)

our_loss.forward(output, target)

our_model.backward(our_loss.backward())

our_optim.step()

[(
-0.9394  0.2829
-0.9570 -0.9508
 0.0185 -0.0531
-0.5339 -0.9712
 1.0330 -0.9913
 0.5648  1.1601
 2.0306  0.6977
-1.5693  1.2258
-0.1933 -0.0247
 0.3024  1.1010
[torch.FloatTensor of size 10x2]
, 
    0     0
    0     0
    0     0
    0     0
    0     0
    0     0
    0     0
    0     0
    0     0
    0     0
[torch.FloatTensor of size 10x2]
), (
 0.1473
-0.9681
-2.4477
-1.2904
 0.2527
 0.1553
 0.4330
-0.5547
-0.7481
-0.1981
[torch.FloatTensor of size 10x1]
, 
    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
[torch.FloatTensor of size 10x1]
), (
 0.0213 -0.3914  1.2891  0.2215  0.4615  0.7060 -0.7218  1.5412  0.9724  1.3990
 0.5743 -0.1761  0.3649  0.7016 -0.6176  0.2110 -0.5071  1.8456  1.7542 -0.0271
-0.1300 -0.9722 -0.2770 -0.0351 -0.2841 -0.5783 -1.8363 -0.5716  1.4358 -1.8119
 0.2769  0.7367  0.2189  1.1120  1.2765 -1.4958 -0.4148 -0.9925 -0.2688 -0.9220
[torch.FloatTensor of size 4x10]
, 
    0     0     0     0     0     0     0     0     0     0
    0     0  

## Test file

In [233]:
def generate_disc_set(nb):
    a = Tensor(nb, 2).uniform_(0, 1)
    target = (a.pow(2).sum(1) < (2/math.pi)).long()
    return a, target

train_input, train_target = generate_disc_set(1000)
test_input, test_target = generate_disc_set(1000)

In [238]:
def convert_to_one_hot(target):
    tmp = FloatTensor(target.size(0), 2).fill_(0)
    for k in range(0, target.size(0)):
        tmp[k, train_target[k]] = 1
    return tmp

In [243]:
train_one_hot_target = convert_to_one_hot(train_target)
test_one_hot_target = convert_to_one_hot(test_target)

In [347]:
model = Sequential(Linear(2,25), Tanh(), Linear(25,25), Tanh(), Linear(25,2), Tanh())

criterion = LossMSE()
optimizer = optim_SGD(model.param(), 1e-1)
nb_epochs = 50
mini_batch_size = 100

for e in tqdm(range(0, nb_epochs)):
    loss = 0
    for b in range(0, train_input.size(0), mini_batch_size):
        output = model.forward(train_input.narrow(0, b, mini_batch_size))
        loss += criterion.forward(output, train_one_hot_target.narrow(0, b, mini_batch_size))
        model.zero_grad()
        model.backward(criterion.backward())
        optimizer.step()
    print("Epoch", e+1, ":", loss)

100%|██████████| 50/50 [00:00<00:00, 259.02it/s]

Epoch 1 : 1208.3341082928696
Epoch 2 : 1208.3341082928696
Epoch 3 : 1208.3341082928696
Epoch 4 : 1208.3341082928696
Epoch 5 : 1208.3341082928696
Epoch 6 : 1208.3341082928696
Epoch 7 : 1208.3341082928696
Epoch 8 : 1208.3341082928696
Epoch 9 : 1208.3341082928696
Epoch 10 : 1208.3341082928696
Epoch 11 : 1208.3341082928696
Epoch 12 : 1208.3341082928696
Epoch 13 : 1208.3341082928696
Epoch 14 : 1208.3341082928696
Epoch 15 : 1208.3341082928696
Epoch 16 : 1208.3341082928696
Epoch 17 : 1208.3341082928696
Epoch 18 : 1208.3341082928696
Epoch 19 : 1208.3341082928696
Epoch 20 : 1208.3341082928696
Epoch 21 : 1208.3341082928696
Epoch 22 : 1208.3341082928696
Epoch 23 : 1208.3341082928696
Epoch 24 : 1208.3341082928696
Epoch 25 : 1208.3341082928696
Epoch 26 : 1208.3341082928696
Epoch 27 : 1208.3341082928696
Epoch 28 : 1208.3341082928696
Epoch 29 : 1208.3341082928696
Epoch 30 : 1208.3341082928696
Epoch 31 : 1208.3341082928696
Epoch 32 : 1208.3341082928696
Epoch 33 : 1208.3341082928696
Epoch 34 : 1208.334


