### Given main.py

In [12]:
#!/usr/bin/env python
from __future__ import print_function
from itertools import count

import torch
import torch.autograd
import torch.nn.functional as F
from torch.autograd import Variable

POLY_DEGREE = 4
W_target = torch.randn(POLY_DEGREE, 1) * 5
b_target = torch.randn(1) * 5


def make_features(x):
    """Builds features i.e. a matrix with columns [x, x^2, x^3, x^4]."""
    x = x.unsqueeze(1)
    return torch.cat([x ** i for i in range(1, POLY_DEGREE+1)], 1)


def f(x):
    """Approximated function."""
    return x.mm(W_target) + b_target[0]


def poly_desc(W, b):
    """Creates a string description of a polynomial."""
    result = 'y = '
    for i, w in enumerate(W):
        result += '{:+.2f} x^{} '.format(w, len(W) - i)
    result += '{:+.2f}'.format(b[0])
    return result


def get_batch(batch_size=32):
    """Builds a batch i.e. (x, f(x)) pair."""
    random = torch.randn(batch_size)
    x = make_features(random)
    y = f(x)
    return Variable(x), Variable(y)


# Define model
fc = torch.nn.Linear(W_target.size(0), 1)

for batch_idx in count(1):
    # Get data
    batch_x, batch_y = get_batch()

    # Reset gradients
    fc.zero_grad()

    # Forward pass
    output = F.smooth_l1_loss(fc(batch_x), batch_y)
    loss = output.data[0]

    # Backward pass
    output.backward()

    # Apply gradients
    for param in fc.parameters():
        param.data.add_(-0.1 * param.grad.data)

    # Stop criterion
    if loss < 1e-3:
        break

print('Loss: {:.6f} after {} batches'.format(loss, batch_idx))
print('==> Learned function:\t' + poly_desc(fc.weight.data.view(-1), fc.bias.data))
print('==> Actual function:\t' + poly_desc(W_target.view(-1), b_target))

Loss: 0.000930 after 322 batches
==> Learned function:	y = +2.69 x^4 +1.16 x^3 -4.11 x^2 -2.46 x^1 -0.03
==> Actual function:	y = +2.68 x^4 +1.30 x^3 -4.10 x^2 -2.50 x^1 -0.09


## Answer 2a

In [50]:
#!/usr/bin/env python
from __future__ import print_function
from itertools import count

import torch
import torch.autograd
import torch.nn.functional as F
from torch.autograd import Variable

torch.manual_seed(123)

POLY_DEGREE = 4
W_target = torch.randn(POLY_DEGREE, 1) * 5
b_target = torch.randn(1) * 5


def make_features(x):
    """Builds features i.e. a matrix with columns [x, x^2, x^3, x^4]."""
    x = x.unsqueeze(1)
    return torch.cat([x ** i for i in range(1, POLY_DEGREE+1)], 1)


def f(x):
    """Approximated function."""
    return x.mm(W_target) + b_target[0]


def poly_desc(W, b):
    """Creates a string description of a polynomial."""
    result = 'y = '
    for i, w in enumerate(W):
        result += '{:+.2f} x^{} '.format(w, len(W) - i)
    result += '{:+.2f}'.format(b[0])
    return result



def get_batch(batch_size=32):
    """Builds a batch i.e. (x, f(x)) pair."""
    random = torch.randn(batch_size)
    x = make_features(random)
    y = f(x)
    return Variable(x), Variable(y)

def run(rate):
    # Define model
    fc = torch.nn.Linear(W_target.size(0), 1)

    # Define optimizer
    optimizer = torch.optim.SGD(fc.parameters(), lr=rate)

    for batch_idx in count(1):
        # Get data
        batch_x, batch_y = get_batch()

        # Reset gradients
        optimizer.zero_grad()

        # Forward pass
        output = F.smooth_l1_loss(fc(batch_x), batch_y)
        loss = output.data[0]

        # Backward pass
        output.backward()

        # Apply gradients
        optimizer.step()

        # Stop criterion
        if loss < 1e-3:
            break

    print('Loss: {:.6f} after {} batches with learning rate {}'.format(loss, batch_idx, rate))
    print('==> Learned function:\t' + poly_desc(fc.weight.data.view(-1), fc.bias.data))
    print('==> Actual function:\t' + poly_desc(W_target.view(-1), b_target))

run(0.01)
run(0.02)
run(0.05)
run(0.1)
run(0.2)
run(0.3)
run(0.5)
run(0.7)

Loss: 0.000962 after 1928 batches with learning rate 0.01
==> Learned function:	y = -0.57 x^4 +0.52 x^3 -1.84 x^2 -1.19 x^1 -5.91
==> Actual function:	y = -0.56 x^4 +0.60 x^3 -1.85 x^2 -1.20 x^1 -5.98
Loss: 0.000899 after 1011 batches with learning rate 0.02
==> Learned function:	y = -0.58 x^4 +0.54 x^3 -1.84 x^2 -1.19 x^1 -5.92
==> Actual function:	y = -0.56 x^4 +0.60 x^3 -1.85 x^2 -1.20 x^1 -5.98
Loss: 0.000743 after 477 batches with learning rate 0.05
==> Learned function:	y = -0.55 x^4 +0.52 x^3 -1.85 x^2 -1.19 x^1 -5.93
==> Actual function:	y = -0.56 x^4 +0.60 x^3 -1.85 x^2 -1.20 x^1 -5.98
Loss: 0.000685 after 438 batches with learning rate 0.1
==> Learned function:	y = -0.59 x^4 +0.61 x^3 -1.85 x^2 -1.22 x^1 -5.99
==> Actual function:	y = -0.56 x^4 +0.60 x^3 -1.85 x^2 -1.20 x^1 -5.98
Loss: 0.000616 after 3392 batches with learning rate 0.2
==> Learned function:	y = -0.62 x^4 +0.55 x^3 -1.80 x^2 -1.19 x^1 -5.97
==> Actual function:	y = -0.56 x^4 +0.60 x^3 -1.85 x^2 -1.20 x^1 -5.98

I observe that if we have learning rate too low, then it takes more batches to train the model. Also if we have learning rate too high, it takes more baches to train the model.

## Answer 2b

In [1]:
import numpy as np
import torch
import torch.autograd
import torch.nn.functional as F
from torch.autograd import Variable
from itertools import count

train_data = np.genfromtxt('qn2_data.csv',delimiter=',')
train_data = torch.from_numpy(train_data)
train_data = train_data.type(torch.FloatTensor)
train_X = train_data[:,:-1]
train_y = train_data[:,-1]

np.random.seed(0)

def get_batch(batch_size=2):
    batch_X = torch.Tensor(batch_size, train_X.size(1))
    batch_y = torch.Tensor(batch_size, 1)
    for i in range(batch_size):
        idx = np.random.randint(0, train_X.size(0))
        batch_X[i] = train_X[idx]
        batch_y[i] = train_y[idx]
    return Variable(batch_X), Variable(batch_y)

def train(rate):
    fc = torch.nn.Linear(train_X.size(1), 1)
    optimizer = torch.optim.SGD(fc.parameters(), lr=rate)
    loss = 0
    for batch_idx in count(1):
        
        batch_X, batch_y = get_batch(2)
        
        optimizer.zero_grad()
        output = F.smooth_l1_loss(fc(batch_X), batch_y)
        loss = output.data[0]
        
        output.backward()
        optimizer.step()
        
        loss = F.smooth_l1_loss(fc(Variable(train_X)), Variable(train_y)).data[0]
        if loss < 0.6:
            break
    print('Loss: {:.6f} after {} batches with learning rate {}'.format(loss, batch_idx, rate))
    weights = fc.weight.data.view(-1)
    bias = fc.bias.data
    print('Learned weights: {:.6f} {:.6f} and bias: {:.6f}'.format(weights[0], weights[1], bias[0]))
    return fc

model = train(0.01)
test_set = torch.Tensor([[6,4],[10,5],[14,8]])
test_set = Variable(test_set)
test_y = model(test_set)
print(test_y.data)

Loss: 0.599692 after 24992 batches with learning rate 0.01
Learned weights: 0.812789 0.945869 and bias: 31.062693

 39.7229
 43.9199
 50.0087
[torch.FloatTensor of size 3x1]



#### Weights
0.812789
0.945869

#### Bias
31.062693

#### Predictions


| Fertilizer | Insecticide | Corn produced |
|---|---|---|
| 6 | 4 | 39.7229 |
| 10 | 5 | 43.9199 |
| 14 | 8 | 50.0087 |

## Answer 2c

In [64]:
import numpy as np
import torch
import torch.autograd
import torch.nn.functional as F
from torch.autograd import Variable
from itertools import count

train_data = np.genfromtxt('qn2_data.csv',delimiter=',')
train_data = torch.from_numpy(train_data)
train_data = train_data.type(torch.FloatTensor)
X = train_data[:,:-1]
y = train_data[:,-1]

X = torch.cat((X, torch.ones(X.size(0), 1)), dim=1)

theta = torch.mm(torch.mm(X.transpose(0, 1), X).inverse(), torch.mm(X.transpose(0, 1), y.unsqueeze(1)))
theta1 = theta.view(-1)
print('Learned weights: {:.6f} {:.6f} and bias: {:.6f}'.format(theta1[0], theta1[1], theta1[2]))

test_X = torch.Tensor([[6,4],[10,5],[14,8]])
test_X = torch.cat((test_X, torch.ones(test_X.size(0), 1)), dim=1)
print(torch.mm(test_X, theta))

Learned weights: 0.650083 1.109883 and bias: 31.980692

 40.3207
 44.0309
 49.9609
[torch.FloatTensor of size 3x1]



#### Weights
0.650083
1.109883

#### Bias
31.980692

#### Predictions


| Fertilizer | Insecticide | Corn produced |
|---|---|---|
| 6 | 4 | 40.3207 |
| 10 | 5 | 44.0309 |
| 14 | 8 | 49.9609 |

The weights and bias learnt by the linear model are approximately same to that of least squares solution. Even the predictions are similar to that of least squares solution. The first weight learned is higher than that of least squares and the second weight learned is lower than that of least squares. The bias is approximately same.