In [2]:
from torch import FloatTensor, LongTensor, Tensor
import math

## Module Superclass

In [3]:
class Module(object):
    def forward(self, *input_):
        raise NotImplementedError
        
    def backward(self, *gradwrtoutput):
        raise NotImplementedError
        
    def param(self):
        return []

## ReLU Module

ReLU function: 
\begin{equation}
f(x) = max(0, x)
\end{equation}

the derivative of ReLU is

\begin{equation} 
f'(x)=
    \begin{cases}
      1, & \text{if}\ x>0 \\
      0, & \text{otherwise}
    \end{cases}
\end{equation}

In [4]:
# This module represents the ReLU activation function
class ReLU(Module):
    def __init__(self):
        self.z = None
    
    # input_: the tensor outputed by the current layer
    def forward(self, input_):
        self.z = input_.clone()
        input_[input_ < 0] = 0
        return input_
        
    def backward(self, gradwrtoutput):
        da = gradwrtoutput
        tensor = self.z.clone()
        # g'(z)
        tensor[tensor > 0] = 1
        tensor[tensor < 0] = 0
        # dz[l]
        return da.mul(tensor)
        
    def param(self):
        return []
    
    def zero_grad(self):
        pass

## Tanh Module

In [5]:
class Tanh(Module):   
    def __init__(self):
        self.z = None
    
    # input_: the tensor outputed by the current layer
    def forward(self, input_):
        self.z = input_
        return input_.tanh()
        
    def backward(self, gradwrtoutput):
        da = gradwrtoutput
        # g'(z)
        g_prime = (1 - self.z.tanh().pow(2))
        # dz[l]
        return da.mul(g_prime)
        
    def param(self):
        return []
    
    def zero_grad(self):
        pass

## Linear Module
fully connected layer

In [6]:
class Linear(Module):   
    def __init__(self, in_dim, out_dim):
        # keep track of the weigths, the biases and the output of the previous layer's activation function
        self.w = Tensor(out_dim,in_dim).normal_(0)
        self.b = Tensor(out_dim,1).normal_(0)
        self.x_previous_layer = None
        # init the gradient of the loss wrt w / b
        self.grad_w_sum = Tensor(self.w.size()).zero_()
        self.grad_b_sum = Tensor(self.b.size()).zero_()
    
    # input_: the output of the previous layer's activation function
    def forward(self, input_):
        self.x_previous_layer = input_
        return (self.w.mm(input_.t()) + self.b).t()
        
    def backward(self, gradwrtoutput):
        dz = gradwrtoutput.t()
        dw = dz.mm(self.x_previous_layer)
        db = dz
        # sum the gradients for the weights and biases
        self.grad_w_sum += dw
        self.grad_b_sum += db.sum(1).unsqueeze(1)
        return (self.w.t().mm(dz)).t()
        
    # returns a list of pairs, each composed of a parameter tensor and a gradient tensor
    # parameters: weights and biases
    def param(self):
        return [ (self.w, self.grad_w_sum), (self.b, self.grad_b_sum) ]
    
    def zero_grad(self):
        self.grad_w_sum.zero_()
        self.grad_b_sum.zero_()

## Sequential Module
to combine several modules in basic sequential structure

In [7]:
# This module allows to combine several modules (layers, activation functions) in a basic sequential structure
class Sequential(Module):    
    def __init__(self, *layers_):
        self.modules = layers_
        
    # input_: the input data is a minibatch whose columns are features and lines are samples
    def forward(self, input_):
        x = input_
        for module in self.modules:
            x = module.forward(x)
        return x
        
    def backward(self, gradwrtoutput):
        x = gradwrtoutput
        for module in reversed(self.modules):
            x = module.backward(x)
        return x
        
    # returns a flatened list of each module's parameters
    # each parameter in the list is represented as a tuple containing the parameter tensor (e.g. w)
    # and the gradient tensor (e.g. dl/dw)
    def param(self):
        return [ p for module in self.modules for p in module.param() ]
    
    # s,
    def zero_grad(self):
        for module in self.modules:
            module.zero_grad()

## MSE Loss Function

In [8]:
class LossMSE(Module): 
    def __init__(self):
        self.error = None
        
    def forward(self, preds, labels):
        self.error = preds - labels
        return self.error.pow(2).sum()
        
    def backward(self):
        return 2 * self.error
        
    def param(self):
        return []

## Cross-entropy Loss Function

In [117]:
class LossCrossEntropy(Module): 
    def __init__(self):
        self.p = None
        self.c = None
        
    # stabilized version of softmax
    def softmax(self, t):
        #stable_exp = (t - t.max(1)[0].unsqueeze(1)).exp()
        #return stable_exp / (stable_exp.sum(1).unsqueeze(1))
        stable_exp = (t - t.max()).exp()
        return stable_exp / (stable_exp.sum())
    
    def convert_to_one_hot(self, target):
        tmp = FloatTensor(target.size(0), 2).fill_(0)
        for k in range(0, target.size(0)):
            tmp[k, target[k]] = 1
        return tmp
        
    def forward(self, preds, labels):
        self.p = self.softmax(preds)
        self.c = self.convert_to_one_hot(labels)
        #print(self.p)
        #print(self.c)
        return -(self.c * self.p.log()).sum()
        #return -((self.c * self.p.log()).sum(1) + ((1 - self.c) * (1 - self.p).log()).sum(1)).sum()
        #return -(self.c * self.p.log()).sum()
        
    def backward(self):
        #print(self.p)
        #print(self.c)
        return(self.p-self.c)
        
    def param(self):
        return []

In [116]:
import numpy as np
X = np.array([[0.2, 0.8], [0.3, 0.7],[0.1, 0.9]])
y = np.array([1, 0, 0])

def softmax_np(X):
    exps = np.exp(X - np.max(X))
    return exps / np.sum(exps)

def cross_entropy(X,y):
    """
    X is the output from fully connected layer (num_examples x num_classes)
    y is labels (num_examples x 1)
    """
    m = y.shape[0]
    p = softmax_np(X)
    log_likelihood = -np.log(p[range(m),y])
    print(y)
    print(np.log(p))
    print(log_likelihood)
    loss = np.sum(log_likelihood)
    return loss

#cross_entropy(X, y)

def delta_cross_entropy(X,y):
    """
    X is the output from fully connected layer (num_examples x num_classes)
    y is labels (num_examples x 1)
    """
    m = y.shape[0]
    grad = softmax_np(X)
    print(grad)
    print(y)
    print(grad[range(m),y])
    grad[range(m),y] -= 1
    grad = grad
    return grad

delta_cross_entropy(X, y)

[[0.11772183 0.21450316]
 [0.13010274 0.19409049]
 [0.10651912 0.23706266]]
[1 0 0]
[0.21450316 0.13010274 0.10651912]


array([[ 0.11772183, -0.78549684],
       [-0.86989726,  0.19409049],
       [-0.89348088,  0.23706266]])

In [114]:
t1 = Tensor([[0.2, 0.8], [0.3, 0.7],[0.1, 0.9]])
t2 = LongTensor([1, 0, 0])
test = LossCrossEntropy()
test.forward(t1, t2)
test.backward()


 0.1177  0.2145
 0.1301  0.1941
 0.1065  0.2371
[torch.FloatTensor of size 3x2]




 0.1177 -0.7855
-0.8699  0.1941
-0.8935  0.2371
[torch.FloatTensor of size 3x2]

In [10]:
t = Tensor(2, 2).fill_(0.5)
-(t.exp() / t.exp().sum()).log()

c = Tensor(2, 2).fill_(0.5)
p = Tensor(2, 2).fill_(0.5)

(c * p.log() + (1 - c) * (1 - p).log()).sum()

-2.7725887298583984

In [11]:
t2 = Tensor([[87.2280, 5.0081], 
 [87.2212, 5.0043],
 [87.2355, 5.0180],
 [87.2333, 5.0175],
 [87.2464, 5.0163]]).float()

In [12]:
def softmax(t):
    stable_exp = (t - t.max(1)[0].unsqueeze(1)).exp()
    #print(stable_exp)
    return stable_exp / (stable_exp.sum(1).unsqueeze(1))

## SGD optimization

In [15]:
class optim_SGD(Module):
    # parameters: the parameters of the Sequential module
    def __init__(self, parameters, learning_rate):
        self.param = parameters #[ p.shallow() for tup in parameters for p in tup ]
        self.lr = learning_rate
        
    # performs a gradient step (SGD) for all parameters
    def step(self):
        for (p, grad_p) in self.param:
            p.sub_(self.lr*grad_p)

## TODO

zero_grad() method: should we put it in the Module class, so that we can remove it from ReLU and Tanh?

## Helpers

In [16]:
def generate_disc_set(nb):
    a = Tensor(nb, 2).uniform_(0, 1)
    target = ((a.pow(2).sum(1)).sqrt() < math.sqrt(1/(2*math.pi))).long()
    return a, target

In [17]:
# converts 'target' Tensor to one hot labels
def convert_to_one_hot(target):
    tmp = FloatTensor(target.size(0), 2).fill_(0)
    for k in range(0, target.size(0)):
        tmp[k, target[k]] = 1
    return tmp

In [18]:
def compute_accuracy(model, input_, target):
    nb_data_errors = 0
    output = model.forward(input_)

    _, predicted_classes = output.max(1)
    for k in range(input_.size(0)):
        if target[k] != predicted_classes[k]:
            nb_data_errors = nb_data_errors + 1
    return 100 - (100*(nb_data_errors / input_.size(0)))

In [19]:
def train_model(train_input, train_target, model, criterion, optimizer, nb_epochs=100, mini_batch_size=5, verbose=False):
    for e in range(0, nb_epochs):
        loss = 0
        for b in range(0, train_input.size(0), mini_batch_size):
            output = model.forward(train_input.narrow(0, b, mini_batch_size))
            # sum the loss for each batch to get the current epoch's loss
            loss += criterion.forward(output, train_target.narrow(0, b, mini_batch_size))
            # set the gradients of all layers to zero before the next batch can go through the network
            model.zero_grad()
            model.backward(criterion.backward())
            optimizer.step() # performs a gradient step to optimize the parameters
        if verbose:
            print("Epoch", e+1, ":", loss)

## Test file

In [20]:
# generate training and testing data
train_input, train_target = generate_disc_set(1000)
test_input, test_target = generate_disc_set(1000)

# convert targets to one hot labels
train_one_hot_target = convert_to_one_hot(train_target)
test_one_hot_target = convert_to_one_hot(test_target)

In [118]:
nb_epochs = 100
mini_batch_size = 5
model = Sequential(Linear(2,25), ReLU(), Linear(25,25), ReLU(), Linear(25, 25), ReLU(), Linear(25,2))
#criterion = LossMSE()
criterion = LossCrossEntropy()
optimizer = optim_SGD(model.param(), 1e-3)

train_model(train_input, train_one_hot_target, model, criterion, optimizer, nb_epochs, mini_batch_size, verbose=True)

print("Train accuracy :", compute_accuracy(model, train_input, train_target), "%")
print("Test accuracy :", compute_accuracy(model, test_input, test_target), "%")

TypeError: Performing basic indexing on a tensor and encountered an error indexing dim 1 with an object of type torch.FloatTensor. The only supported types are integers, slices, numpy scalars, or if indexing with a torch.LongTensor or torch.ByteTensor only a single Tensor may be passed.

In [241]:
n_iters = 50
test_acc=[]
train_acc=[]
for i in range(n_iters):
    print('iter', i+1)
    model = Sequential(Linear(2,25), Tanh(), Linear(25,25), Tanh(), Linear(25, 25), Tanh(), Linear(25,2))
    #criterion = LossMSE()
    criterion = LossCrossEntropy()
    optimizer = optim_SGD(model.param(), 1e-3)
    
    train_model(model, criterion, optimizer, nb_epochs, mini_batch_size)

    train_a = compute_accuracy(model, train_input, train_target)
    test_a = compute_accuracy(model, test_input, test_target)
    print("Train accuracy :", train_a, "%")
    print("Test accuracy :", test_a, "%")
    train_acc.append(train_a)
    test_acc.append(test_a)

iter 1
Train accuracy : 87.0 %
Test accuracy : 86.1 %
iter 2
Train accuracy : 87.0 %
Test accuracy : 86.1 %
iter 3
Train accuracy : 87.0 %
Test accuracy : 86.1 %
iter 4


KeyboardInterrupt: 

In [202]:
print('train accuracy', sum(train_acc)/len(train_acc))
print('test accuracy', sum(test_acc)/len(test_acc))

train accuracy 99.456
test accuracy 99.17
