# parametrized  linear scalar function

Let $f: \mathbb{R} \times \mathbb{R}^2 \rightarrow \mathbb{R}$ be a parametraized scalar function given by
$$
f(x; \theta) = f(x; a, b) = a x + b. \quad
$$
Its partial derivatives are
$$
\frac{\partial}{\partial x}f(x; a, b) = a, \quad 
\frac{\partial}{\partial a}f(x; a, b) = x, \quad 
\frac{\partial}{\partial b}f(x; a, b) = 1 .
$$
We want to compute the partial derivatives with respect to the parameters.

In [1]:
import torch

In [2]:
def check_gradient(x, grad_y):
    if grad_y[0] == x and \
       grad_y[1] == 1.:
        return True
    else:
        return False
    
def check_gradient_vect(x, grad_y):
    batch_size = x.shape[0]
    if (grad_y[:, 0] == x.squeeze()).all() and \
       (grad_y[:, 1] == torch.ones(batch_size)).all():
        return True
    else:
        return False

## backward() methods

1. Multiple scalar input variables (no batch input)

In [3]:
def linear(x, a, b):
    return a*x + b

create torch tensors for each function variable. Impose 'requires_grad' for the parameters which we want to differentiate with respect with.

In [4]:
x = torch.randn(1, requires_grad=False, dtype=torch.float)
a = torch.randn(1, requires_grad=True, dtype=torch.float)
b = torch.randn(1, requires_grad=True, dtype=torch.float)
y = linear(x, a, b)

print(a, a.requires_grad, a.is_leaf)
print(b, b.requires_grad, b.is_leaf)
print(x, x.requires_grad, x.is_leaf)
print(y, y.type(), y.requires_grad, y.is_leaf)

tensor([1.0473], requires_grad=True) True True
tensor([0.8933], requires_grad=True) True True
tensor([-0.6949]) False True
tensor([0.1655], grad_fn=<AddBackward0>) torch.FloatTensor True False


In [5]:
# compute gradients
y.backward()

# show partial derivatives
grad_y = torch.hstack((a.grad, b.grad))
print(a.grad == x)
print(b.grad == 1)

# reset gradients 
a.grad.zero_()
b.grad.zero_()

# show partial derivatives
print(a.grad)
print(b.grad)

# check gradient
check_gradient(x, grad_y)

tensor([True])
tensor([True])
tensor([0.])
tensor([0.])


True

2. Multiple scalar input variables (batch input)

In [6]:
# define batch size and input
batch_size = 1000
x_vect = torch.randn((batch_size, 1), requires_grad=False, dtype=torch.float)

In [7]:
%%time

# evaluate function
y = linear(x_vect, a, b)

# preallocate Jacobian matrix with respect to the parameters
grad_y = torch.empty(batch_size, 2)

for i in range(batch_size):
    
    # use vector-Jacobian product
    v = torch.eye(batch_size)[i].reshape(batch_size, 1)
    y.backward(v, retain_graph=True)
    
    # save gradients
    grad_y[i, 0] = a.grad
    grad_y[i, 1] = b.grad
    
    # reset gradients
    a.grad.zero_()
    b.grad.zero_()

CPU times: user 1.66 s, sys: 20.1 ms, total: 1.68 s
Wall time: 173 ms


In [8]:
check_gradient_vect(x_vect, grad_y)

True

## grad() mehtod

In [9]:
from torch.autograd import grad

1. Multiple scalar input variables (no batch input)

In [10]:
y = linear(x, a, b)
grad_y = grad(y, (a, b))
check_gradient(x, grad_y)

True

2. Multiple scalar input variables (batch input)

In [11]:
x = torch.randn(100, requires_grad=True)
t = torch.randn(2, requires_grad=True)
u = u(x,t)

# 1st derivatives
dt = torch.autograd.grad(u, t)[0]
dx = torch.autograd.grad(u, x, create_graph=True)[0]

# 2nd derivatives (higher orders require `create_graph=True`)
ddx = torch.autograd.grad(dx, x)[0]

NameError: name 'u' is not defined

In [12]:
y = linear(x_vect, a, b)
basis_vectors = torch.eye(batch_size)
v = basis_vectors[0]
grad(y, a)
#grad_y = grad(y, (a, b), v, retain_graph=True)
#check_gradient(x, grad_y)

RuntimeError: grad can be implicitly created only for scalar outputs

In [123]:
%%time

# evaluate function
#y = linear(x_vect, a, b)

#part_y_x1, = grad(y, (x1,))

# basis vectors
#basis_vectors = torch.eye(batch_size)

# gradient
#grad_y = torch.stack([
#    grad(y, (a, b), v, retain_graph=True)[0][i] 
#    for i, v in enumerate(basis_vectors.unbind())
#])

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


## jacobian() method

In [124]:
from torch.autograd.functional import jacobian

1. Compute all partial derivatives (no batch input)

In [130]:
jac_y_x, jac_y_a, jac_y_b = jacobian(linear, (x, a, b))
grad_y = torch.hstack((jac_y_a[0], jac_y_b[0]))
check_gradient(x, grad_y)

True

2. Compute partial derivatives with respect to the parameters (no batch input)

In [132]:
jac_y_a, jac_y_b = jacobian(lambda a,b : linear(x, a, b), (a, b))
grad_y = torch.hstack((jac_y_a[0], jac_y_b[0]))
check_gradient(x, grad_y)

True

3. Compute all partial derivatives (batch input)

In [133]:
%%time

# compute gradients
jac_y = jacobian(linear, (x_vect, a, b))
jac_y_x, jac_y_a, jac_y_b = jac_y

# show partial derivatives
print(jac_y_a.shape)
print(jac_y_b.shape)

torch.Size([1000, 1, 1])
torch.Size([1000, 1, 1])
CPU times: user 36.2 ms, sys: 19 µs, total: 36.2 ms
Wall time: 35.6 ms


In [100]:
check_gradient_vect(x_vect, torch.hstack((jac_y_a.squeeze(dim=2), jac_y_b.squeeze(dim=2))))

True

4. Compute just the partial derivatives with respect to the parameters (batch input)

In [137]:
%%time

# compute gradients
jac_y = jacobian(lambda a,b : linear(x_vect, a, b), (a, b))
jac_y_a, jac_y_b = jac_y

# show partial derivatives
print(jac_y_a.shape)
print(jac_y_b.shape)

torch.Size([1000, 1, 1])
torch.Size([1000, 1, 1])
CPU times: user 31 ms, sys: 0 ns, total: 31 ms
Wall time: 30.5 ms


In [138]:
check_gradient_vect(x_vect, torch.hstack((jac_y_a.squeeze(dim=2), jac_y_b.squeeze(dim=2))))

True

# parametrized  linear scalar function (using the nn Module)

### no batch input

In [84]:
# create linear model
d_in, d_out = 1, 1
model = nn.Linear(d_in, d_out, bias=True)
parameters = list(model.parameters())
print(type(parameters), len(parameters))

# get parameters
a = model._parameters['weight']
b = model._parameters['bias']

# evaluate model at x
x = torch.randn(d_in, requires_grad=False, dtype=torch.float)
y = model(x)

print(a, a.requires_grad, a.is_leaf)
print(b, b.requires_grad, b.is_leaf)
print(x, x.requires_grad, x.is_leaf)
print(y, y.type(), y.requires_grad, y.is_leaf)

# run .backward() + partial derivatives
y.backward()
print(a.grad == x)
print(b.grad == 1)

<class 'list'> 2
Parameter containing:
tensor([[-0.9509]], requires_grad=True) True True
Parameter containing:
tensor([-0.1825], requires_grad=True) True True
tensor([-0.3124]) False True
tensor([0.1146], grad_fn=<AddBackward0>) torch.FloatTensor True False
tensor([[True]])
tensor([True])


## batch input

### using backwards()

In [90]:
# define batch size and input
batch_size = 10
x = torch.randn((batch_size, d_in), requires_grad=False, dtype=torch.float)

# evaluate model
y = model(x)

# preallocate Jacobian matrix with respect to the coefficients
jac_y = torch.empty(batch_size, 2)

for i in range(batch_size):
    
    # use vector-Jacobian product
    v = torch.eye(batch_size)[i].reshape(batch_size, 1)
    y.backward(v, retain_graph=True)
    
    # save gradients
    jac_y[i, 0] = a.grad
    jac_y[i, 1] = b.grad
    
    # reset gradients
    a.grad.zero_()
    b.grad.zero_()

# show Jacobian
print(jac_y)
print((jac_y[:, 0] == x[:, 0]).all())
print((jac_y[:, 1] == torch.ones(batch_size)).all())
#print(jac_y[:, 0], x[:, 0])
#print(jac_y[:, 1], torch.ones(batch_size))

tensor([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]])
tensor(False)
tensor(False)


### using jacobian()

In [91]:
# define batch size and input
batch_size = 10
x = torch.randn((batch_size, d_in), requires_grad=False, dtype=torch.float)

def partial_model(a, b):
    model._parameters['weight'] = a
    model._parameters['bias'] = b
    return model(x)

# compute gradients
jac_y = jacobian(partial_model, (a, b))
jac_y_a, jac_y_b = jac_y

# show partial derivatives
print(jac_y_a.shape)
print(jac_y_b.shape)
print((jac_y_a[:, 0, 0, 0] == x[:, 0]).all())
print((jac_y_b[:, 0, 0] == torch.ones(batch_size)).all())

torch.Size([10, 1, 1, 1])
torch.Size([10, 1, 1])
tensor(True)
tensor(True)


In [95]:
# define batch size and input
batch_size = 10
x = torch.randn((batch_size, d_in), requires_grad=False, dtype=torch.float)

# compute gradients
jac_y = jacobian(linear, (x, a, b))
jac_y_x, jac_y_a, jac_y_b = jac_y

# show partial derivatives
print(jac_y_a.shape)
print(jac_y_b.shape)
print((jac_y_a[:, 0, 0, 0] == x[:, 0]).all())
print((jac_y_b[:, 0, 0] == torch.ones(batch_size)).all())




torch.Size([10, 1, 1, 1])
torch.Size([10, 1, 1])
tensor(True)
tensor(True)
