In [16]:
from torch import FloatTensor, LongTensor, Tensor

In [50]:
class Module(object):
    
    def forward(self, *input_):
        raise NotImplementedError
        
    def backward(self, *gradwrtoutput):
        raise NotImplementedError
        
    def param(self):
        return []

### ReLU Module

ReLU function: 
\begin{equation}
f(x) = max(0, x)
\end{equation}

the derivative of ReLU is

\begin{equation} 
f'(x)=
    \begin{cases}
      1, & \text{if}\ x>0 \\
      0, & \text{otherwise}
    \end{cases}
\end{equation}

In [51]:
class ReLU(Module):
    def __init__(self):
        self.z = None
    
    def forward(self, input_):
        self.z = input_.clone()
        input_[input_ < 0] = 0
        return input_
        
    def backward(self, gradwrtoutput):
        da = gradwrtoutput
        tensor = self.z.clone()
        # g'(z)
        tensor[tensor > 0] = 1
        tensor[tensor < 0] = 0
        # dz[l]
        return da.mul(tensor)
        
    def param(self):
        return []

### Tanh Module

In [52]:
class Tanh(Module):   
    def __init__(self):
        self.z = None
    
    def forward(self, input_):
        self.z = input_
        return input_.tanh()
        
    def backward(self, gradwrtoutput):
        da = gradwrtoutput
        # g'(z)
        g_prime = (1 - self.z.tanh().pow(2))
        # dz[l]
        return da.mul(tensor)
        
    def param(self):
        return []

### Linear Module
fully connected layer

In [122]:
class Linear(Module):   
    def __init__(self, in_dim, out_dim):        
        self.w = Tensor(out_dim,in_dim).normal_(0)
        self.b = Tensor(out_dim,1).normal_(0)
        self.x_previous_layer = None
        # sum the gradient wrt w / b for each batch in these variables
        self.grad_w_sum = Tensor(self.w.size()).zero_()
        self.grad_b_sum = Tensor(self.b.size()).zero_()
    
    def forward(self, input_):
        self.x_previous_layer = input_
        return (self.w.mm(input_.t()) + self.b).t()
        
    def backward(self, gradwrtoutput):
        dz = gradwrtoutput.t()
        dw = dz.mm(self.x_previous_layer)
        db = dz
        print(dw.size())
        print(db.size())
        # sum the gradients for the weights and biases
        self.grad_w_sum.add(dw)
        self.grad_b_sum.add(db)
        
        return self.w.t().mm(dz)
        
    def param(self):
        # TODO
        return []
    
    def zero_grad(self):
        self.grad_w_sum.zero_()
        self.grad_b_sum.zero_()
        
    def SGD_step(self, lr):
        self.w -= lr*self.grad_w_sum
        self.b -= lr*self.grad_b_sum

### Sequential Module
to combine several modules in basic sequential structure

In [116]:
class Sequential(Module):    
    def __init__(self, *layers_):
        self.modules = layers_
        
    def forward(self, input_):
        x = input_
        for module in self.modules:
            x = module.forward(x)
        return x
        
    def backward(self, gradwrtoutput):
        x = gradwrtoutput
        for module in self.modules:
            x = module.backward(x)
        return x
        
    def param(self):
        # TODO
        return []

### MSE Loss Function TODO

In [101]:
class LossMSE(Module): 
    def __init__(self):
        self.error = None
        
    def forward(self, preds, labels):
        self.error = preds - labels
        return self.error.pow(2).sum()
        
    def backward(self):
        return 2 * self.error
        
    def param(self):
        # TODO
        return []

### testing

In [72]:
a = FloatTensor(5, 2)
a.normal_(0, 5)
b = FloatTensor(5, 2)
b.normal_(0, 5)


-3.9147  5.9485
-4.1990 -1.6456
 8.7344 -7.4869
 2.4498 -8.1163
-0.4472  5.3993
[torch.FloatTensor of size 5x2]

In [73]:
print(a)
print(b)


 -9.4639 -11.7929
 -3.3858 -14.1017
  3.7349   1.5512
  4.7306  -0.6805
  1.5711  -0.4694
[torch.FloatTensor of size 5x2]


-3.9147  5.9485
-4.1990 -1.6456
 8.7344 -7.4869
 2.4498 -8.1163
-0.4472  5.3993
[torch.FloatTensor of size 5x2]



In [74]:
x = ReLU().forward(a)
type(x)

torch.FloatTensor

In [79]:
target = FloatTensor(5, 10).normal_(0,5)

In [123]:
our_model = Sequential(Linear(2,10),ReLU())

our_loss = LossMSE()

output = our_model.forward(a)

our_loss.forward(output, target)

2242.38652664423

In [124]:
our_model.backward(our_loss.backward())

torch.Size([10, 2])
torch.Size([10, 5])


RuntimeError: inconsistent tensor size, expected r_ [2 x 5], t [2 x 5] and src [5 x 10] to have the same number of elements, but got 10, 10 and 50 elements respectively at /Users/soumith/code/builder/wheel/pytorch-src/torch/lib/TH/generic/THTensorMath.c:1036

In [115]:
our_loss.backward()



Columns 0 to 7 
 -5.2156   8.9197 -20.2387   9.5524  12.1348   7.4901   2.8185  28.8327
 12.3700  -1.9795  11.1196  16.2809  -0.5563  -1.1511 -22.5246  -2.9792
 12.2615  11.4033   7.3916  21.8851   2.4965  -1.7212  -3.4069  -2.0032
  3.0220  20.7735   8.1369   5.1801   3.6803  14.6389   3.6854 -22.3885
  6.3670  23.5802  -3.6112  -3.5011   7.9782 -15.0273  -7.8661 -21.9332

Columns 8 to 9 
-10.9242  26.9693
  9.2024 -11.0887
 16.4431  18.5549
 17.0725  18.1277
 10.1746   7.5075
[torch.FloatTensor of size 5x10]

In [125]:
output


 0.0931  0.0000  0.0000  0.8406  0.0000  0.0111  0.0000  0.0000  0.0000  0.3615
 0.0931  0.0000  0.0000  0.8406  0.0000  0.0111  0.0000  0.0000  0.0000  0.3615
 3.4060  0.1470  0.0000  1.3889  5.7810  4.9087  4.7950  0.0000  5.7277  0.0000
 1.5007  0.6573  0.0000  2.5456  6.2245  3.5140  4.2874  0.0000  6.7593  0.0000
 0.5606  0.0000  0.0000  1.4068  1.4057  1.1744  1.1125  0.0000  1.8837  0.0000
[torch.FloatTensor of size 5x10]