In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

In [5]:
class RNN(nn.Module):
    """One step of a minimal recurrent cell made of two linear layers.

    The step input is concatenated with the previous hidden state and
    projected to the new hidden state; the new hidden state is then projected
    to the step output. No nonlinearity is applied anywhere.
    """

    def __init__(self, data_size, hidden_size, output_size):
        """Create the two projections.

        Model constructors are ordinary Python constructors and may take
        whatever arguments they need.

        Args:
            data_size: width of the per-step input.
            hidden_size: width of the hidden state.
            output_size: width of the per-step output.
        """
        super().__init__()
        self.hidden_size = hidden_size

        # The first projection reads the step input and the previous hidden
        # state side by side, so its fan-in is the sum of both widths.
        self.i2h = nn.Linear(data_size + hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)

    def forward(self, data, last_hidden):
        """Run one step and return the pair ``(hidden, output)``."""
        combined = torch.cat((data, last_hidden), 1)
        next_hidden = self.i2h(combined)
        return next_hidden, self.h2o(next_hidden)

# Build the model (input width 50, hidden width 20, output width 10) and move
# it to the GPU only when CUDA is actually available, so constructing it does
# not crash on a CPU-only machine.
rnn = RNN(50, 20, 10)
if torch.cuda.is_available():
    rnn = rnn.cuda()

In [6]:
loss_fn = nn.MSELoss()

batch_size = 10
TIMESTEPS = 5

# Read the layer widths from the model instead of repeating its constructor
# arguments, so the fake data always matches whatever `rnn` was built with.
hidden_size = rnn.hidden_size
data_size = rnn.i2h.in_features - hidden_size
output_size = rnn.h2o.out_features

# Create some fake data
batch = Variable(torch.randn(batch_size, data_size))
hidden = Variable(torch.zeros(batch_size, hidden_size))
target = Variable(torch.zeros(batch_size, output_size))

# Put the data wherever the model's parameters live rather than assuming CUDA.
if next(rnn.parameters()).is_cuda:
    batch, hidden, target = batch.cuda(), hidden.cuda(), target.cuda()

loss = 0
for t in range(TIMESTEPS):
    # yes! you can reuse the same network several times,
    # sum up the losses, and call backward!
    # Each step feeds the hidden state produced by the previous step back in.
    hidden, output = rnn(batch, hidden)
    loss += loss_fn(output, target)
loss.backward()

In [7]:
# Display the loss that was summed over all TIMESTEPS iterations of the loop above.
loss

Variable containing:
 0.4756
[torch.cuda.FloatTensor of size 1 (GPU 0)]

In [10]:
import torch
from torch.autograd import Variable

class MyReLU(torch.autograd.Function):
    """ReLU implemented as a custom autograd function.

    ``forward`` and ``backward`` are static methods that receive a context
    object, which is the form ``Function.apply`` expects. The preferred call
    is ``MyReLU.apply(x)``; calling an instance (``MyReLU()(x)``) is kept
    working for existing callers and simply forwards to ``apply``.
    """

    @staticmethod
    def forward(ctx, input):
        """Return ``input`` with negative entries clamped to zero.

        The input is saved on ``ctx`` because ``backward`` needs its sign.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """Return the gradient with respect to the forward input.

        The incoming gradient passes through unchanged wherever the forward
        input was >= 0 and is zeroed wherever it was negative.
        """
        input, = ctx.saved_tensors
        # Clone first so the caller's gradient tensor is not modified in place.
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input

    def __call__(self, *inputs):
        """Backward-compatible instance call: route ``MyReLU()(x)`` to ``apply``."""
        return type(self).apply(*inputs)


In [11]:
# Tensor type applied to every tensor created in this cell (CPU float by default).
dtype = torch.FloatTensor
# dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

N = 64        # batch size
D_in = 1000   # input dimension
H = 100       # hidden dimension
D_out = 10    # output dimension

# Random inputs and targets. They are data, not parameters, so they keep
# Variable's default of requires_grad=False.
x = Variable(torch.randn(N, D_in).type(dtype))
y = Variable(torch.randn(N, D_out).type(dtype))

# Random weights for the two layers. These are what gets trained, so autograd
# must track gradients for them.
w1 = Variable(torch.randn(D_in, H).type(dtype), requires_grad=True)
w2 = Variable(torch.randn(H, D_out).type(dtype), requires_grad=True)

In [12]:
learning_rate = 1e-6
for t in range(500):
  # Forward pass: compute predicted y using operations on Variables; we compute
  # ReLU using our custom autograd operation, invoked through Function.apply
  # (no instance of MyReLU is needed).
  y_pred = MyReLU.apply(x.mm(w1)).mm(w2)

  # Compute and print loss. The loss is the sum of squared errors; .item()
  # converts the single-element result to a plain Python number for printing.
  loss = (y_pred - y).pow(2).sum()
  print(t, loss.item())

  # Use autograd to compute the backward pass.
  loss.backward()

  # The weight update itself must not be recorded by autograd.
  with torch.no_grad():
    # Update weights using gradient descent
    w1 -= learning_rate * w1.grad
    w2 -= learning_rate * w2.grad

    # Manually zero the gradients after updating weights, because backward()
    # accumulates into .grad instead of overwriting it.
    w1.grad.zero_()
    w2.grad.zero_()


0 24572762.0
1 17448632.0
2 15674968.0
3 16505464.0
4 18559984.0
5 20345608.0
6 20450922.0
7 18048208.0
8 13789528.0
9 9236118.0
10 5643547.0
11 3299696.5
12 1937154.375
13 1184139.625
14 771703.5
15 540213.0
16 403865.5
17 318270.40625
18 260597.203125
19 219079.0625
20 187485.390625
21 162384.359375
22 141827.21875
23 124618.7421875
24 109995.578125
25 97436.234375
26 86574.5859375
27 77123.7109375
28 68884.75
29 61671.0546875
30 55326.63671875
31 49729.0703125
32 44782.12109375
33 40396.0859375
34 36499.046875
35 33028.2109375
36 29931.47265625
37 27163.41796875
38 24687.201171875
39 22464.80859375
40 20467.38671875
41 18671.06640625
42 17054.294921875
43 15594.4765625
44 14274.9150390625
45 13080.216796875
46 11997.7509765625
47 11015.7158203125
48 10123.1552734375
49 9310.8349609375
50 8571.3115234375
51 7896.57177734375
52 7280.89599609375
53 6718.4033203125
54 6204.630859375
55 5734.5048828125
56 5303.37353515625
57 4907.61572265625
58 4544.28076171875
59 4210.3671875
60 3903.25

In [13]:
import torch
from torch.autograd import Variable

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables.
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Use the nn package to define our model as a sequence of layers. nn.Sequential
# is a Module which contains other Modules, and applies them in sequence to
# produce its output. Each Linear Module computes output from input using a
# linear function, and holds internal Variables for its weight and bias.
model = torch.nn.Sequential(
          torch.nn.Linear(D_in, H),
          torch.nn.ReLU(),
          torch.nn.Linear(H, D_out),
        )

# The nn package also contains definitions of popular loss functions; in this
# case we use the squared-error loss. reduction='sum' adds the squared errors
# up instead of averaging them; it replaces the deprecated size_average=False.
loss_fn = torch.nn.MSELoss(reduction='sum')



In [14]:
learning_rate = 1e-4
for t in range(500):
  # Forward pass: compute predicted y by passing x to the model. Module objects
  # override the __call__ operator so you can call them like functions. When
  # doing so you pass a Variable of input data to the Module and it produces
  # a Variable of output data.
  y_pred = model(x)

  # Compute and print loss. We pass Variables containing the predicted and true
  # values of y, and the loss function returns a Variable containing the loss.
  # .item() converts that single-element result to a plain Python number.
  loss = loss_fn(y_pred, y)
  print(t, loss.item())

  # Zero the gradients before running the backward pass.
  model.zero_grad()

  # Backward pass: compute gradient of the loss with respect to all the learnable
  # parameters of the model. Internally, the parameters of each Module are stored
  # in Variables with requires_grad=True, so this call will compute gradients for
  # all learnable parameters in the model.
  loss.backward()

  # Update the weights using gradient descent. The update is done under
  # no_grad so that autograd does not record it.
  with torch.no_grad():
    for param in model.parameters():
      param -= learning_rate * param.grad

0 689.8340454101562
1 636.1437377929688
2 590.5167236328125
3 551.1314697265625
4 516.6248779296875
5 485.6264343261719
6 457.4693298339844
7 431.8029479980469
8 408.3084411621094
9 386.571533203125
10 366.4464416503906
11 347.69671630859375
12 330.1016540527344
13 313.5181579589844
14 297.81640625
15 282.8961181640625
16 268.7088928222656
17 255.20001220703125
18 242.27272033691406
19 229.98085021972656
20 218.28599548339844
21 207.07884216308594
22 196.35353088378906
23 186.06716918945312
24 176.26417541503906
25 166.93894958496094
26 158.0320587158203
27 149.54640197753906
28 141.44764709472656
29 133.7354736328125
30 126.38764190673828
31 119.40802001953125
32 112.7530288696289
33 106.43074798583984
34 100.43157196044922
35 94.75811004638672
36 89.3875961303711
37 84.2991943359375
38 79.480712890625
39 74.91854858398438
40 70.60263061523438
41 66.53007507324219
42 62.68637466430664
43 59.058101654052734
44 55.628578186035156
45 52.396156311035156
46 49.35190963745117
47 46.48133850

In [16]:
import torch
from torch.autograd import Variable

class TwoLayerNet(torch.nn.Module):
  """Two linear layers with a ReLU (clamp at zero) between them."""

  def __init__(self, D_in, H, D_out):
    """Create the layers.

    Args:
      D_in: input dimension.
      H: hidden dimension.
      D_out: output dimension.
    """
    super().__init__()
    self.linear1 = torch.nn.Linear(D_in, H)
    self.linear2 = torch.nn.Linear(H, D_out)

  def forward(self, x):
    """Return the prediction for input ``x``."""
    hidden = self.linear1(x)
    # Clamping at zero is the ReLU nonlinearity.
    return self.linear2(hidden.clamp(min=0))


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out)

# Construct our loss function and an Optimizer. The call to model.parameters()
# in the SGD constructor will contain the learnable parameters of the two
# nn.Linear modules which are members of the model.
# reduction='sum' adds the squared errors up instead of averaging them; it
# replaces the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)


In [17]:
for t in range(500):
  # Forward pass: Compute predicted y by passing x to the model
  y_pred = model(x)

  # Compute and print loss. .item() converts the single-element loss to a
  # plain Python number for printing.
  loss = criterion(y_pred, y)
  print(t, loss.item())

  # Zero gradients, perform a backward pass, and update the weights.
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

0 643.0243530273438
1 595.5492553710938
2 554.590087890625
3 519.0363159179688
4 487.38214111328125
5 458.8332824707031
6 432.8763732910156
7 409.1353759765625
8 387.2763671875
9 367.0556640625
10 348.2125244140625
11 330.5260314941406
12 313.8564758300781
13 298.0380554199219
14 283.0099792480469
15 268.70745849609375
16 255.10940551757812
17 242.13999938964844
18 229.7659912109375
19 217.9561004638672
20 206.6938018798828
21 195.91355895996094
22 185.689208984375
23 175.9453887939453
24 166.6654510498047
25 157.82672119140625
26 149.46337890625
27 141.52899169921875
28 133.9996795654297
29 126.83983612060547
30 120.04042053222656
31 113.58797454833984
32 107.46755981445312
33 101.66817474365234
34 96.18196105957031
35 90.99839782714844
36 86.09934997558594
37 81.46709442138672
38 77.09391784667969
39 72.95397186279297
40 69.04399108886719
41 65.34394073486328
42 61.854774475097656
43 58.556312561035156
44 55.443702697753906
45 52.507972717285156
46 49.73945617675781
47 47.12308502197

In [20]:
import random
import torch
from torch.autograd import Variable

class DynamicNet(torch.nn.Module):
  """Network whose depth is chosen at random on every forward pass.

  The same middle layer is applied between zero and three times (inclusive),
  so its weights are shared across however many repetitions are drawn.
  """

  def __init__(self, D_in, H, D_out):
    """Create the input, (shared) middle, and output linear layers.

    Args:
      D_in: input dimension.
      H: hidden dimension.
      D_out: output dimension.
    """
    super().__init__()
    self.input_linear = torch.nn.Linear(D_in, H)
    self.middle_linear = torch.nn.Linear(H, H)
    self.output_linear = torch.nn.Linear(H, D_out)

  def forward(self, x):
    """Return the prediction for ``x`` using a freshly drawn number of middle layers."""
    activations = self.input_linear(x).clamp(min=0)

    # Draw the number of middle-layer repetitions once per call.
    remaining = random.randint(0, 3)
    while remaining > 0:
      activations = self.middle_linear(activations).clamp(min=0)
      remaining -= 1

    return self.output_linear(activations)


# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable(torch.randn(N, D_in))
y = Variable(torch.randn(N, D_out), requires_grad=False)

# Construct our model by instantiating the class defined above
model = DynamicNet(D_in, H, D_out)

# Run on the GPU when one is available; otherwise everything stays on the CPU
# instead of crashing on an unconditional .cuda() call. The model is moved
# before the optimizer is built from its parameters.
if torch.cuda.is_available():
  x, y, model = x.cuda(), y.cuda(), model.cuda()

# Construct our loss function and an Optimizer. Training this strange model with
# vanilla stochastic gradient descent is tough, so we use momentum.
# reduction='sum' adds the squared errors up instead of averaging them; it
# replaces the deprecated size_average=False.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)


In [21]:
for t in range(500):
  # Forward pass: Compute predicted y by passing x to the model
  y_pred = model(x)

  # Compute and print loss. .item() converts the single-element loss to a
  # plain Python number for printing.
  loss = criterion(y_pred, y)
  print(t, loss.item())

  # Zero gradients, perform a backward pass, and update the weights.
  optimizer.zero_grad()
  loss.backward()
  optimizer.step()

0 619.101806640625
1 617.2335205078125
2 613.8818969726562
3 620.6778564453125
4 634.2705688476562
5 611.875
6 593.324462890625
7 581.1236572265625
8 492.7060546875
9 554.492919921875
10 605.7705078125
11 588.48095703125
12 585.7365112304688
13 581.6905517578125
14 576.2718505859375
15 494.08551025390625
16 562.0896606445312
17 466.8683166503906
18 294.36016845703125
19 269.7208557128906
20 410.9238586425781
21 204.3607635498047
22 171.6367645263672
23 348.3695983886719
24 118.25515747070312
25 304.83001708984375
26 278.8433837890625
27 83.10514831542969
28 423.6156921386719
29 200.48236083984375
30 87.9278564453125
31 86.89578247070312
32 479.09979248046875
33 147.5576629638672
34 66.51579284667969
35 285.3500671386719
36 383.5586242675781
37 49.60307312011719
38 43.913543701171875
39 34.03856658935547
40 279.01226806640625
41 250.75416564941406
42 108.7726058959961
43 225.26535034179688
44 97.77766418457031
45 154.73336791992188
46 155.2713165283203
47 75.13819885253906
48 130.756393