In [None]:
# SOURCE: http://seba1511.net/tutorials/beginner/pytorch_with_examples.html#annotations:E9HdvPynEemYwidYvwe30g

In [None]:
# When using autograd, the forward pass of your 
# network will define a computational graph; 
# nodes in the graph will be Tensors, and edges 
# will be functions that produce output Tensors 
# from input Tensors. Backpropagating through this 
#  graph then allows you to easily compute gradients.

# We wrap our PyTorch Tensors in Variable objects; 
# a Variable represents a node in a computational graph.
# If x is a Variable then x.data is a Tensor, 
# and x.grad is another Variable holding the gradient 
# of x with respect to some scalar value.

# PyTorch Variables have the same API as PyTorch 
# Tensors: (almost) any operation that you can perform 
# on a Tensor also works on Variables; the difference 
# is that using Variables defines a computational graph,
# allowing you to automatically compute gradients.

In [3]:
import torch
from torch.autograd import Variable 

In [4]:
dtype = torch.FloatTensor
dtype
# dtype = torch.cuda.FloatTensor # runs on GPU

torch.FloatTensor

In [5]:
# N = batch size
# D_int = input dimension
# H = hidden dimension
# D_out = output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

In [6]:
# Create random Tensors to hold input and outputs, and wrap them in 
# Variables.
# Setting requires_grad=False indicates that we do not need to compute 
# gradients
# with respect to these Variables during the backward pass.
X = Variable(torch.randn(N, D_in).type(dtype), requires_grad=False)
print(X)
Y = Variable(torch.randn(N, D_out).type(dtype), requires_grad=False)
#print(Y)

tensor([[-0.3766, -0.1936, -0.5011,  ..., -1.1732, -0.9163,  0.9096],
        [-0.4446, -0.4733, -1.0679,  ...,  2.5234,  0.3103, -1.0592],
        [ 0.9649, -0.6487, -0.7849,  ..., -0.0245, -0.6961, -0.1528],
        ...,
        [-1.1061,  0.2995, -0.1649,  ..., -0.8803, -0.0596, -0.6238],
        [ 1.0835,  1.4910, -0.1038,  ..., -0.7648,  0.5591,  0.2665],
        [-0.2231, -0.4785,  1.0952,  ..., -0.7110, -0.4988, -0.1576]])


In [7]:
# Create random Tensors for weights, and wrap them in Variables.
# Setting requires_grad=True indicates that we want to compute gradients 
# with respect to these Variables during the backward pass.
W1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
W2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

In [8]:

print(W1.size())
print(W1.dim())

print(W1)

torch.Size([1000, 100])
2
tensor([[ 0.4023,  0.7879, -0.9253,  ...,  0.6129,  0.6854, -0.1731],
        [ 0.4399, -1.3762, -0.7482,  ...,  0.5881, -0.9118, -0.7574],
        [-0.3286, -0.6265,  1.7217,  ...,  0.4157,  1.0880, -0.1702],
        ...,
        [ 0.2141, -1.0338,  1.1188,  ...,  1.8133,  0.3519, -1.0181],
        [-1.1927,  1.1318,  0.3350,  ..., -1.4112,  0.0026,  1.6291],
        [-0.0387,  0.3654,  0.2027,  ..., -0.2694,  0.3746, -0.2575]],
       requires_grad=True)


In [9]:
print(W2.size())
print(W2.dim())

#print(W2)

torch.Size([100, 10])
2


In [11]:
learningRate = 1e-6
NUM_ITER = 500

for t in range(NUM_ITER):
    # Forward pass: compute predicted y using operations on Variables; 
    # these  are exactly the same operations we used to compute the 
    # forward pass using Tensors, but we do not need to keep 
    # references to intermediate values since we are not implementing 
    # the backward pass by hand.
    
    h = X.mm(W1) # activation for hidden layer
    hRELU = h.clamp(min = 0)
    yPred = hRELU.mm(W2) # activation for output layer

    # Compute and print loss using operations on Variables.
    # Now loss is a Variable of shape (1,) and loss.data is a Tensor 
    # of shape (1,); loss.data[0] is a scalar value holding 
    # the loss.
    loss = (yPred - Y).pow(2).sum()
    
    if t % 50 == 0:
        print("iter = ", t, "; loss = ", loss.data[0])

    
    #gradYPred = 2.0 * (yPred - Y)
    #gradW2 = hRELU.t().mm(gradYPred)
    #gradHiddenRELU = gradYPred.mm(W2.t())
    #gradH = gradHiddenRELU.clone()
    #gradH[h < 0] = 0
    #gradW1 = X.t().mm(gradH)


    # Use autograd to compute the backward pass. This call will 
    # compute the gradient of loss with respect to all Variables 
    # with requires_grad=True. After this call w1.grad and w2.grad 
    # will be Variables holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Learning rule: Update weights
    # Update weights using gradient descent; w1.data and w2.data are 
    # Tensors, w1.grad and w2.grad are Variables and w1.grad.data 
    # and w2.grad.data are Tensors.
    W1.data -= learningRate * W1.grad.data # gradW1
    W2.data -= learningRate * W2.grad.data # gradW2
    
    # Necessary state-maintenance step: manually set the gradients to 
    # zero after updating weights (??)
    W1.grad.data.zero_()
    W2.grad.data.zero_()
    




iter =  0 ; loss =  tensor(0.0001)


iter =  50 ; loss =  tensor(0.0000)


iter =  100 ; loss =  tensor(0.0000)
iter =  150 ; loss =  tensor(0.0000)
iter =  200 ; loss =  tensor(0.0000)


iter =  250 ; loss =  tensor(0.0000)


iter =  300 ; loss =  tensor(8.6453e-06)
iter =  350 ; loss =  tensor(7.3214e-06)


iter =  400 ; loss =  tensor(6.2060e-06)
iter =  450 ; loss =  tensor(5.4699e-06)
