In [None]:
# SOURCE: http://seba1511.net/tutorials/beginner/pytorch_with_examples.html#annotations:E9HdvPynEemYwidYvwe30g

In [None]:
# When using autograd, the forward pass of your 
# network will define a computational graph; 
# nodes in the graph will be Tensors, and edges 
# will be functions that produce output Tensors 
# from input Tensors. Backpropagating through this 
#  graph then allows you to easily compute gradients.

# We wrap our PyTorch Tensors in Variable objects; 
# a Variable represents a node in a computational graph.
# If x is a Variable then x.data is a Tensor, 
# and x.grad is another Variable holding the gradient 
# of x with respect to some scalar value.

# PyTorch Variables have the same API as PyTorch 
# Tensors: (almost) any operation that you can perform 
# on a Tensor also works on Variables; the difference 
# is that using Variables defines a computational graph,
# allowing you to automatically compute gradients.

In [1]:
import torch
from torch.autograd import Variable 

In [2]:
dtype = torch.FloatTensor
dtype
# dtype = torch.cuda.FloatTensor # runs on GPU

torch.FloatTensor

In [3]:
# N = batch size
# D_int = input dimension
# H = hidden dimension
# D_out = output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

In [4]:
# Create random Tensors to hold input and outputs, and wrap them in 
# Variables.
# Setting requires_grad=False indicates that we do not need to compute 
# gradients
# with respect to these Variables during the backward pass.
X = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
print(X)
Y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)
#print(Y)

tensor([[ 1.2877,  1.7476, -0.9897,  ...,  0.9788,  1.8288, -2.5328],
        [-0.2216, -1.1619, -0.3167,  ..., -1.2521, -0.1662,  1.4233],
        [-0.8671,  0.2041,  0.8794,  ...,  0.0019,  0.3129, -0.2765],
        ...,
        [-1.1136, -0.3622,  0.0525,  ...,  0.0321, -0.9496, -0.5120],
        [ 0.4586, -0.4575, -0.1369,  ...,  0.2476, -0.3643,  0.0021],
        [-1.6410,  1.4266,  0.2169,  ..., -0.3201, -0.3449, -0.5649]])


In [12]:
# Create random Tensors for weights, and wrap them in Variables.
# Setting requires_grad=True indicates that we want to compute gradients 
# with respect to these Variables during the backward pass.
W1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
W2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

In [13]:

print(W1.size())
print(W1.dim())

print(W1)

torch.Size([1000, 100])
2
tensor([[-0.6212, -0.9507,  0.1739,  ..., -0.2685, -0.2253,  0.1302],
        [ 0.3207,  1.5027, -0.9396,  ...,  0.5319,  1.3009, -0.2934],
        [ 0.2509,  0.5461, -0.9063,  ..., -2.0845,  0.6393,  1.6020],
        ...,
        [-1.6332,  0.0274,  0.2031,  ..., -1.8663,  0.7127, -1.5207],
        [ 0.7908,  0.4172, -1.3753,  ..., -1.0964, -0.3508,  0.7271],
        [-0.0022,  0.1598,  0.1027,  ...,  0.3834,  1.9736, -1.7433]],
       requires_grad=True)


In [14]:
print(W2.size())
print(W2.dim())

#print(W2)

torch.Size([100, 10])
2


In [18]:
learningRate = 1e-6
NUM_ITER = 500

for t in range(NUM_ITER):
    # Forward pass: compute predicted y using operations on Variables; 
    # these  are exactly the same operations we used to compute the 
    # forward pass using Tensors, but we do not need to keep 
    # references to intermediate values since we are not implementing 
    # the backward pass by hand.
    
    h = X.mm(W1) # activation for hidden layer
    hRELU = h.clamp(min = 0)
    yPred = hRELU.mm(W2) # activation for output layer

    # Compute and print loss using operations on Variables.
    # Now loss is a Variable of shape (1,) and loss.data is a Tensor 
    # of shape (1,); loss.data[0] is a scalar value holding 
    # the loss.
    loss = (yPred - Y).pow(2).sum()
    
    if t % 50 == 0:
        print("iter = ", t, "; loss = ", loss)

    
    #gradYPred = 2.0 * (yPred - Y)
    #gradW2 = hRELU.t().mm(gradYPred)
    #gradHiddenRELU = gradYPred.mm(W2.t())
    #gradH = gradHiddenRELU.clone()
    #gradH[h < 0] = 0
    #gradW1 = X.t().mm(gradH)


    # Use autograd to compute the backward pass. This call will 
    # compute the gradient of loss with respect to all Variables 
    # with requires_grad=True. After this call w1.grad and w2.grad 
    # will be Variables holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Learning rule: Update weights
    # Update weights using gradient descent; w1.data and w2.data are 
    # Tensors, w1.grad and w2.grad are Variables and w1.grad.data 
    # and w2.grad.data are Tensors.
    W1.data -= learningRate * W1.grad.data # gradW1
    W2.data -= learningRate * W2.grad.data # gradW2
    
    # Necessary state-maintenance step: manually set the gradients to 
    # zero after updating weights (??)
    W1.grad.data.zero_()
    W2.grad.data.zero_()
    


iter =  0 ; loss =  tensor(1.7772, grad_fn=<SumBackward0>)


iter =  50 ; loss =  tensor(0.8522, grad_fn=<SumBackward0>)
iter =  100 ; loss =  tensor(0.4124, grad_fn=<SumBackward0>)


iter =  150 ; loss =  tensor(0.2012, grad_fn=<SumBackward0>)
iter =  200 ; loss =  tensor(0.0990, grad_fn=<SumBackward0>)
iter =  250 ; loss =  tensor(0.0492, grad_fn=<SumBackward0>)


iter =  300 ; loss =  tensor(0.0249, grad_fn=<SumBackward0>)
iter =  350 ; loss =  tensor(0.0129, grad_fn=<SumBackward0>)
iter =  400 ; loss =  tensor(0.0069, grad_fn=<SumBackward0>)


iter =  450 ; loss =  tensor(0.0039, grad_fn=<SumBackward0>)
