### PyTorch Tutorial Part 1 (Intro to PyTorch, linear regression and neural networks)
Alan Ritter, Ohio State University

Portions adapted from the PyTorch tutorial by Justin Johnson

In [2]:
import torch
import torch as th

### First, let's generate a random (regression) dataset

In [18]:
in_dim = 10
out_dim = 5
n_samples = 50

# Seed the global RNG so the dataset (and every randomly initialised weight
# drawn afterwards) is reproduced exactly on Restart Kernel -> Run All.
SEED = 0
th.manual_seed(SEED)

# Randomly generate a dataset.
#   X: n_samples x in_dim,  entries uniform on [0, 1)
#   Y: n_samples x out_dim, entries standard normal (drawn independently of X)
X     = th.rand(n_samples, in_dim)
Y     = th.randn(n_samples, out_dim)

# Show only the first example rather than dumping the full tensors
print(X[0,:])
print(Y[0,:])

tensor([[0.7208, 0.6609, 0.6895, 0.1177, 0.4334, 0.1400, 0.8343, 0.9763, 0.2755,
         0.5469],
        [0.7759, 0.3053, 0.6611, 0.6532, 0.3164, 0.2139, 0.7463, 0.5057, 0.8818,
         0.4592],
        [0.1251, 0.9306, 0.7558, 0.8154, 0.4443, 0.0165, 0.7029, 0.4335, 0.8956,
         0.8348],
        [0.8063, 0.3091, 0.3728, 0.4536, 0.2751, 0.9637, 0.6239, 0.8424, 0.3730,
         0.1390],
        [0.4389, 0.6490, 0.1259, 0.0055, 0.0930, 0.2968, 0.2896, 0.4596, 0.6572,
         0.0835],
        [0.8341, 0.7809, 0.6508, 0.0366, 0.6756, 0.2515, 0.1661, 0.0022, 0.2199,
         0.1409],
        [0.0953, 0.6206, 0.3546, 0.6872, 0.7086, 0.3697, 0.0729, 0.8270, 0.3981,
         0.1874],
        [0.3903, 0.5942, 0.7852, 0.2264, 0.5011, 0.3918, 0.5273, 0.0711, 0.9862,
         0.2251],
        [0.9153, 0.1766, 0.2200, 0.4600, 0.1372, 0.2017, 0.2361, 0.6755, 0.3285,
         0.4383],
        [0.8806, 0.5171, 0.0113, 0.3275, 0.9627, 0.0665, 0.7663, 0.2118, 0.9866,
         0.1044],
        [0

### Linear Regression

In [4]:
# Initialize the parameters: a single (in_dim x out_dim) weight matrix whose
# entries are drawn from a standard normal distribution.
W = th.randn((in_dim, out_dim))

In [5]:
learning_rate = 0.001

# Plain gradient descent on the summed squared error of the linear model X.W
for step in range(1000):
    # Forward: predictions of the linear model
    output = th.mm(X, W)

    # Sum of squared residuals over every example and output dimension
    residual = output - Y
    loss = residual.pow(2).sum()

    # Report progress every 100 steps
    if step % 100 == 0:
        print("loss=%s" % loss)

    # d(loss)/d(output) = 2 * residual;  d(loss)/dW = X^T . d(loss)/d(output)
    error  = 2.0 * residual
    grad_W = th.mm(X.t(), error)

    # Take one in-place gradient step on W
    W.sub_(learning_rate * grad_W)

loss=tensor(1339.5227)
loss=tensor(238.7469)
loss=tensor(207.9514)
loss=tensor(201.3121)
loss=tensor(199.3423)
loss=tensor(198.5982)
loss=tensor(198.2677)
loss=tensor(198.1061)
loss=tensor(198.0230)
loss=tensor(197.9788)


### Initialize Weights for a Neural Network with One Hidden Layer

In [6]:
# Width of the single hidden layer
hidden_dim = 100

# Both weight matrices start as draws from a Gaussian with mean 0 and variance 1:
#   W_in  : (in_dim, hidden_dim)   input  -> hidden
#   W_out : (hidden_dim, out_dim)  hidden -> output
W_in  = th.randn((in_dim, hidden_dim))
W_out = th.randn((hidden_dim, out_dim))

### Train the network with manually computed gradients

In [7]:
learning_rate = 0.001

for step in range(10000):
    # Forward pass: linear -> sigmoid -> linear
    h         = th.mm(X, W_in)
    h_sigmoid = h.sigmoid()
    output    = th.mm(h_sigmoid, W_out)

    # Summed squared error over all examples and outputs
    loss = (output - Y).pow(2).sum()

    # Report progress every 1000 steps
    if step % 1000 == 0:
        print("loss=%s" % loss)

    # Backward pass: apply the chain rule from the loss back to each weight matrix.
    # The sigmoid's derivative is s * (1 - s), where s is its output.
    grad_output     = 2.0 * (output - Y)
    grad_W_out      = th.mm(h_sigmoid.t(), grad_output)
    grad_h_sigmoid  = th.mm(grad_output, W_out.t())
    grad_h          = grad_h_sigmoid * h_sigmoid * (1 - h_sigmoid)
    grad_W_in       = th.mm(X.t(), grad_h)

    # In-place gradient-descent step on both weight matrices
    W_in.sub_(learning_rate * grad_W_in)
    W_out.sub_(learning_rate * grad_W_out)
        

loss=tensor(4808.4966)
loss=tensor(147.8679)
loss=tensor(92.2713)
loss=tensor(58.1469)
loss=tensor(38.0554)
loss=tensor(25.1213)
loss=tensor(16.7602)
loss=tensor(11.4410)
loss=tensor(7.9654)
loss=tensor(5.6048)


### Now, the same thing, but with gradients computed automatically

In [17]:
data_type = torch.FloatTensor # CPU
# data_type = torch.cuda.FloatTensor # GPU

hidden_dim = 100

#Initialize Weights (initialized from Gaussian distribution with 0 mean and 1 variance)
# requires_grad_() marks each weight matrix so that autograd records operations on it
# and fills in its .grad on backward(). The old Variable wrapper is not needed:
# since PyTorch 0.4 plain tensors carry requires_grad themselves.
W_in  = th.randn(in_dim,hidden_dim).type(data_type).requires_grad_()
W_out = th.randn(hidden_dim,out_dim).type(data_type).requires_grad_()

# Inputs and targets are used as-is; no gradient is requested for them here.
X_var = X.type(data_type)
Y_var = Y.type(data_type)

learning_rate = 1e-3

for i in range(10000):
    #Forward Pass
    output = X_var.mm(W_in).sigmoid().mm(W_out)

    #Squared error (summed over all examples and outputs)
    loss = (output - Y_var).pow(2).sum()

    if i%1000 == 0:
        print("loss=%s" % loss.item())

    #The backward pass is very simple:
    loss.backward()

    ################################################################
    # Now, we don't have to do this by hand!
    ################################################################
    #grad_output     = 2.0 * (output - Y)
    #grad_W_out      = h_sigmoid.t().mm(grad_output)
    #grad_h_sigmoid  = grad_output.mm(W_out.t())
    #grad_h          = grad_h_sigmoid * h_sigmoid * (1-h_sigmoid)
    #grad_W_in       = X.t().mm(grad_h)
    ################################################################

    # Update the parameters. The update itself must not be recorded by autograd,
    # so it runs under no_grad() instead of reaching around autograd via .data.
    with torch.no_grad():
        W_in  -= learning_rate * W_in.grad
        W_out -= learning_rate * W_out.grad

        # backward() accumulates into .grad, so reset it before the next step
        W_in.grad.zero_()
        W_out.grad.zero_()

loss=9548.947265625
loss=151.18746948242188
loss=95.04609680175781
loss=59.54445266723633
loss=38.978336334228516
loss=25.69999885559082
loss=17.442073822021484
loss=12.360021591186523
loss=9.058777809143066
loss=6.748473644256592


### Let's do it again, this time using the nn package

In [14]:
data_type = torch.FloatTensor # CPU
# data_type = torch.cuda.FloatTensor # GPU

hidden_dim = 100

# Plain tensors are enough here; the deprecated Variable wrapper is not needed.
X_var = X.type(data_type)
Y_var = Y.type(data_type)

# Same architecture as before (linear -> sigmoid -> linear), except that
# nn.Linear creates and initialises its own weight and bias parameters.
model = torch.nn.Sequential(
    torch.nn.Linear(in_dim, hidden_dim),
    torch.nn.Sigmoid(),
    torch.nn.Linear(hidden_dim,out_dim)
)

# NOTE: with its default reduction MSELoss *averages* the squared error over all
# elements, so these loss values are on a different scale from the summed
# squared error printed by the hand-written versions.
mse_loss = torch.nn.MSELoss()

learning_rate = 0.01


for i in range(10000):
    #Forward Pass
    output = model(X_var)

    #Mean squared error
    loss = mse_loss(output, Y_var)

    if i%1000 == 0:
        print("loss=%s" % loss.item())

    #The backward pass is very simple:
    loss.backward()

    #Update the parameters (under no_grad so autograd does not record the update)
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

    # Clear the accumulated gradients before the next iteration
    model.zero_grad()



loss=1.116289734840393
loss=0.9731495976448059
loss=0.9483171105384827
loss=0.9248232841491699
loss=0.9023768305778503
loss=0.881300151348114
loss=0.8621872067451477
loss=0.8455711603164673
loss=0.8316991925239563
loss=0.820482611656189


### And now using Adagrad (torch.optim)
References on Adagrad:
- http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf
- https://cs.stanford.edu/~ppasupat/a9online/1107.html

In [21]:
data_type = torch.FloatTensor # CPU
# data_type = torch.cuda.FloatTensor # GPU

hidden_dim = 100

# Plain tensors are enough here; the deprecated Variable wrapper is not needed.
X_var = X.type(data_type)
Y_var = Y.type(data_type)

model = torch.nn.Sequential(
    torch.nn.Linear(in_dim, hidden_dim),
    torch.nn.Sigmoid(),
    torch.nn.Linear(hidden_dim,out_dim)
)

# With its default reduction MSELoss averages the squared error over all elements.
mse_loss = torch.nn.MSELoss()

learning_rate = 0.1

#Adagrad
optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)

# NOTE(review): printing X_var[0] before and after training looks like a sanity
# check that the optimizer leaves the input data untouched - confirm intent.
print(X_var[0])

for i in range(10000):
    #Forward Pass
    output = model(X_var)

    #Mean squared error
    loss = mse_loss(output, Y_var)

    # Report the scalar loss (as the other cells do) rather than dumping the
    # whole output tensor every 1000 steps.
    if i%1000 == 0:
        print("loss=%s" % loss.item())

    #The backward pass is very simple:
    loss.backward()

    #Update the parameters
    optimizer.step()

    # Clear the gradients of every parameter the optimizer manages
    optimizer.zero_grad()
print(X_var[0])

tensor([0.7208, 0.6609, 0.6895, 0.1177, 0.4334, 0.1400, 0.8343, 0.9763, 0.2755,
        0.5469])
tensor([0.7208, 0.6609, 0.6895, 0.1177, 0.4334, 0.1400, 0.8343, 0.9763, 0.2755,
        0.5469])
