In [221]:
from torch import FloatTensor, LongTensor, Tensor
import math
from tqdm import tqdm, tqdm_notebook

In [50]:
class Module(object):
    
    def forward(self, *input_):
        raise NotImplementedError
        
    def backward(self, *gradwrtoutput):
        raise NotImplementedError
        
    def param(self):
        return []

### ReLU Module

ReLU function: 
\begin{equation}
f(x) = max(0, x)
\end{equation}

the derivative of ReLU is

\begin{equation} 
f'(x)=
    \begin{cases}
      1, & \text{if}\ x>0 \\
      0, & \text{otherwise}
    \end{cases}
\end{equation}

In [350]:
class ReLU(Module):
    def __init__(self):
        self.z = None
    
    def forward(self, input_):
        self.z = input_.clone()
        input_[input_ < 0] = 0
        return input_
        
    def backward(self, gradwrtoutput):
        da = gradwrtoutput
        tensor = self.z.clone()
        # g'(z)
        tensor[tensor > 0] = 1
        tensor[tensor < 0] = 0
        # dz[l]
        return da.mul(tensor)
        
    def param(self):
        return []
    
    def zero_grad(self):
        pass

### Tanh Module

In [340]:
class Tanh(Module):   
    def __init__(self):
        self.z = None
    
    def forward(self, input_):
        self.z = input_
        return input_.tanh()
        
    def backward(self, gradwrtoutput):
        da = gradwrtoutput
        # g'(z)
        g_prime = (1 - self.z.tanh().pow(2))
        # dz[l]
        return da.mul(g_prime)
        
    def param(self):
        return []
    
    def zero_grad(self):
        pass

### Linear Module
fully connected layer

In [479]:
class Linear(Module):   
    def __init__(self, in_dim, out_dim):        
        self.w = Tensor(out_dim,in_dim).normal_(0)
        self.b = Tensor(out_dim,1).normal_(0)
        self.x_previous_layer = None
        # sum the gradient wrt w / b for each batch in these variables
        self.grad_w_sum = Tensor(self.w.size()).zero_()
        self.grad_b_sum = Tensor(self.b.size()).zero_()
    
    def forward(self, input_):
        self.x_previous_layer = input_
        return (self.w.mm(input_.t()) + self.b).t()
        
    def backward(self, gradwrtoutput):
        dz = gradwrtoutput.t()
        dw = dz.mm(self.x_previous_layer)
        db = dz
        # sum the gradients for the weights and biases
        print(self.grad_w_sum.size())
        print(dw.size())
        print('-'*20)
        print(self.grad_b_sum.size())
        print(db.sum(1).unsqueeze(1).size())
        self.grad_w_sum += dw
        self.grad_b_sum += db.sum(1)
        return (self.w.t().mm(dz)).t()
        
    def param(self):
        return [ (self.w, self.grad_w_sum), (self.b, self.grad_b_sum) ]
    
    def zero_grad(self):
        self.grad_w_sum.zero_()
        self.grad_b_sum.zero_()

### Sequential Module
to combine several modules in basic sequential structure

In [327]:
class Sequential(Module):    
    def __init__(self, *layers_):
        self.modules = layers_
        
    def forward(self, input_):
        x = input_
        for module in self.modules:
            x = module.forward(x)
        return x
        
    def backward(self, gradwrtoutput):
        x = gradwrtoutput
        for module in reversed(self.modules):
            x = module.backward(x)
        return x
        
    def param(self):
        return [ p for module in self.modules for p in module.param() ]
    
    def zero_grad(self):
        for module in self.modules:
            module.zero_grad()

### MSE Loss Function

In [101]:
class LossMSE(Module): 
    def __init__(self):
        self.error = None
        
    def forward(self, preds, labels):
        self.error = preds - labels
        return self.error.pow(2).sum()
        
    def backward(self):
        return 2 * self.error
        
    def param(self):
        return []

In [465]:
class optim_SGD(Module): 
    def __init__(self, parameters, learning_rate):
        self.param = parameters #[ p.shallow() for tup in parameters for p in tup ]
        self.lr = learning_rate
        
    def step(self):
        for (p, grad_p) in self.param:
            p.sub_(self.lr*grad_p)

### testing

In [267]:
a = FloatTensor(5, 2)
a.normal_(0, 5)
b = FloatTensor(5, 2)
b.normal_(0, 5)


  5.9373   7.2770
 -7.0436   2.5153
  7.2012   3.9569
 18.7668  -2.0585
  3.5095   5.1504
[torch.FloatTensor of size 5x2]

In [73]:
print(a)
print(b)


 -9.4639 -11.7929
 -3.3858 -14.1017
  3.7349   1.5512
  4.7306  -0.6805
  1.5711  -0.4694
[torch.FloatTensor of size 5x2]


-3.9147  5.9485
-4.1990 -1.6456
 8.7344 -7.4869
 2.4498 -8.1163
-0.4472  5.3993
[torch.FloatTensor of size 5x2]



In [291]:
a.sum(1)


  2.4001
 -7.2299
  5.4902
  6.9134
 10.6199
[torch.FloatTensor of size 5]

In [318]:
a.pow(2).sum()

260.3018182516098

In [324]:
target = FloatTensor(5, 4).normal_(0,5)

In [480]:
our_model = Sequential(Linear(2,10), ReLU(), Linear(10,4), ReLU())

our_loss = LossMSE()

our_optim = optim_SGD(our_model.param(), 0.01)

output = our_model.forward(a)

our_loss.forward(output, target)

our_model.backward(our_loss.backward())

#our_optim.step()

torch.Size([4, 10])
torch.Size([4, 10])
--------------------
torch.Size([4, 1])
torch.Size([4, 1])
torch.Size([10, 2])
torch.Size([10, 2])
--------------------
torch.Size([10, 1])
torch.Size([10, 1])


  return self.add_(other)



1.00000e+05 *
 -0.0107  0.0179
 -4.0004 -0.3849
  0.0003  0.0006
  0.0013  0.0031
  0.0021  0.0051
[torch.FloatTensor of size 5x2]

In [461]:
our_model.param()[0][0]


 1.0126  4.5310
-7.2013  4.0270
 3.9680  1.7036
 1.6381 -7.2932
-0.5733  1.6145
 3.9516  5.6916
 2.2626 -1.8835
 7.8173 -0.8127
-4.2478  7.7019
-1.4106 -3.9694
[torch.FloatTensor of size 10x2]

In [455]:
our_optim.step()


    0
    0
    0
    0
    0
    0
    0
    0
    0
    0
[torch.FloatTensor of size 10x1]



In [394]:
our_model.param()[0][0].zero_()


    0     0
    0     0
    0     0
    0     0
    0     0
    0     0
    0     0
    0     0
    0     0
    0     0
[torch.FloatTensor of size 10x2]

In [378]:
print(our_model.param())
print("="*50)
our_optim.step()
print("="*50)
print(our_model.param())

[(
 0.1647  1.5122
 0.5307  0.1592
 1.9198 -2.5079
-1.2089  0.4031
-1.0443 -0.5580
 0.6487 -0.1277
 0.0070 -0.1935
 0.1507 -0.2277
 0.7951 -1.3963
 1.2994  0.1784
[torch.FloatTensor of size 10x2]
, 
  731.1170   153.0622
 4487.7734  -821.4435
  631.4024  -193.0466
  182.6864    30.8173
 -487.6267   -82.2576
 1381.4794    24.7660
-1999.9968   783.3753
 -713.5247   -88.8846
 2720.7378  -558.8494
 2034.0952  -178.8320
[torch.FloatTensor of size 10x2]
), (
-0.9272
-1.3761
-0.7448
-0.2707
 0.1445
-0.6812
 0.2777
 1.0100
-0.8755
 0.7084
[torch.FloatTensor of size 10x1]
, 
  84.6115
 506.7211
  68.2542
 -29.5306
  78.8230
 166.8729
-217.7150
-121.7368
 304.8163
 239.9527
[torch.FloatTensor of size 10]
), (
-1.5164  1.0109 -1.3242 -1.4095 -0.7296  0.1188  0.0954  0.6558  2.2024  0.1759
 1.1016  2.6712  0.4880  1.3274 -1.2304  0.4205 -1.8041 -0.1396  1.7075  0.9373
 0.0361 -0.3390 -0.6613 -0.8050  1.3170  2.0945  1.6856 -1.7166 -0.6874  1.3346
-1.8927  0.6111 -0.7577 -0.4711  1.6690  0.2081 -0.

## Test file

In [233]:
def generate_disc_set(nb):
    a = Tensor(nb, 2).uniform_(0, 1)
    target = (a.pow(2).sum(1) < (2/math.pi)).long()
    return a, target

train_input, train_target = generate_disc_set(1000)
test_input, test_target = generate_disc_set(1000)

In [238]:
def convert_to_one_hot(target):
    tmp = FloatTensor(target.size(0), 2).fill_(0)
    for k in range(0, target.size(0)):
        tmp[k, train_target[k]] = 1
    return tmp

In [243]:
train_one_hot_target = convert_to_one_hot(train_target)
test_one_hot_target = convert_to_one_hot(test_target)

In [466]:
model = Sequential(Linear(2,25), Tanh(), Linear(25,25), Tanh(), Linear(25,2), Tanh())

criterion = LossMSE()
optimizer = optim_SGD(model.param(), 1e-1)
nb_epochs = 50
mini_batch_size = 100

for e in tqdm(range(0, nb_epochs)):
    loss = 0
    for b in range(0, train_input.size(0), mini_batch_size):
        output = model.forward(train_input.narrow(0, b, mini_batch_size))
        loss += criterion.forward(output, train_one_hot_target.narrow(0, b, mini_batch_size))
        model.zero_grad()
        model.backward(criterion.backward())
        optimizer.step()
    print("Epoch", e+1, ":", loss)

  return self.add_(other)
 90%|█████████ | 45/50 [00:00<00:00, 222.15it/s]

Epoch 1 : 2990.264766845929
Epoch 2 : 2963.9999977350235
Epoch 3 : 2963.9999977350235
Epoch 4 : 2963.9999977350235
Epoch 5 : 2963.9999977350235
Epoch 6 : 2963.9999977350235
Epoch 7 : 2963.9999977350235
Epoch 8 : 2963.9999977350235
Epoch 9 : 2963.9999977350235
Epoch 10 : 2963.9999977350235
Epoch 11 : 2963.9999977350235
Epoch 12 : 2963.9999977350235
Epoch 13 : 2963.9999977350235
Epoch 14 : 2963.9999977350235
Epoch 15 : 2963.9999977350235
Epoch 16 : 2963.9999977350235
Epoch 17 : 2963.9999977350235
Epoch 18 : 2963.9999977350235
Epoch 19 : 2963.9999977350235
Epoch 20 : 2963.9999977350235
Epoch 21 : 2963.9999977350235
Epoch 22 : 2963.9999977350235
Epoch 23 : 2963.9999977350235
Epoch 24 : 2963.9999977350235
Epoch 25 : 2963.9999977350235
Epoch 26 : 2963.999997615814
Epoch 27 : 2963.999997615814
Epoch 28 : 2963.999997615814
Epoch 29 : 2963.999997615814
Epoch 30 : 2963.999997615814
Epoch 31 : 2963.999997615814
Epoch 32 : 2963.999997615814
Epoch 33 : 2963.999997615814
Epoch 34 : 2963.999997615814

100%|██████████| 50/50 [00:00<00:00, 220.97it/s]

Epoch 46 : 2963.999997615814
Epoch 47 : 2963.999997615814
Epoch 48 : 2963.999997615814
Epoch 49 : 2963.999997496605
Epoch 50 : 2963.999997496605





In [483]:
 x = Tensor(4,1)

In [484]:
Tensor(x.size())


 0.0000e+00
 1.0842e-19
 1.4138e+10
 4.6577e-10
[torch.FloatTensor of size 4x1]

In [None]:
x = Tensor(10,1).normal_(0)