# The code here is the implementation of the Single Neuron Neural Network defined from: https://www.overleaf.com/project/63992e2c41d53fa75e5f7398


## Importing the packages

In [1]:
import torch
import numpy as np
import pdb
from torch.autograd import grad

## Defining the network
We define a single neuron neural network with initial weight $\theta=0$. 

In [2]:
class SingleNet():
  def __init__(self):
    theta = torch.tensor([1.], requires_grad=True) # task specific initialization
    self.weight = theta # set initial weight to 1
  
  def __call__(self, x):
    return self.weight * x

In [3]:
## TEST ## 
# we test if the neuron works x=5 f_w=w*x=1*5=5
net = SingleNet()
print(net(5))

tensor([5.], grad_fn=<MulBackward0>)


Defining dataset

---

Tables:
```
D_1 | x   y  |$f_\theta$ 
----|------------------
Q 1 | 1   2  |- 
S 1 | 2   4  |2 
    | 3   1  |3 

```

```
D_2 | x   y  |$f_\theta$ 
----|------------------
Q 2 | 4   1  |- 
S 2 | 5   3  |5 
    | 6   0  |6 
```

In [4]:
# We define 2 datasets for our case. 
D1 = {'query': torch.tensor([(1, 2)], dtype=torch.int64), 'support': torch.tensor([(2, 4), (3, 1)])} # (x, y) pairs for query (Q1) and support (S1) set.
D2 = {'query': torch.tensor([(4, 1)], dtype=torch.int64), 'support': torch.tensor([(5, 3), (6, 0)])}
D_all = [D1, D2]
print(f"x_D1 (support set): {D1['support'][:,0]}")  #x
print(f"y_D1 (support set): {D1['support'][:,1]}")  # y


x_D1 (support set): tensor([2, 3])
y_D1 (support set): tensor([4, 1])


We define the loss function (MSE loss)

$\mathcal{L}_{S_j}(f_\theta(x),y)=\sum_{(x_i^j,y_i^j) \in S_j}{(y_i^j-\theta x_i^j)^2}$

In [5]:
def loss(weight, dataset, mode='train'):
  '''
  Regression loss over dataset
  '''
  # l1 loss
  if mode == 'train':
    data = dataset['support']
  if mode == 'test':
    data = dataset['query']
  return torch.sum((data[:, 1] - weight * data[:, 0])**2)

Then we define the gradient of loss function

$\frac{\partial \mathcal{L}_{S_j}(f_\theta)}{\partial \theta}=-2\sum_{(x_i^j,y_i^j)\in S_j}{x_i^j(y_i^j-\theta x_i^j)}$

Task-specific weight (inner) update is calculated

$\varphi_j(\theta^{(0)}) = \theta^{(0)} -\alpha 	\frac{\partial \mathcal{L}_{S_j}(f_\theta)}{\partial w} \Bigg|_{\theta=\theta^{(0)}}$

$
  = \theta^{(0)}+2\alpha \sum_{(x_i^j,y_i^j)\in S_j}{x_i^j(y_i^j-\theta^{(0)}x_i^j)}  $


where $\alpha=0.1$ is the task-specific learning rate

In [None]:
def inner_gradient(net, dataset, mode='train', weight=0, compute_autograd=False):
  if mode == 'train':
    data = dataset['support']
    weight = net.weight

  if mode == 'test':
    data = dataset['query']
    weight = net.weight
    print(f"task-specific weight: {weight}")
  if compute_autograd:
    task_specific_loss = loss(weight, dataset, mode=mode)

    # using backward here
    # task_specific_loss.backward()
    # task_specific_gradients = net.weight.grad

    # instead of using loss.backward(), use torch.autograd.grad() to compute gradients
    task_specific_gradients = grad(loss, net.weight, create_graph=True)
    print(f"First order loss grad: {task_specific_gradients}")  # need to be converted into orch.tensor
  else: 
    # compute gradient manually using formula
    task_specific_gradients = -2 * torch.sum(data[:, 0] * (data[:, 1] - weight.item() * data[:, 0]))
  return task_specific_gradients

In [None]:
def inner_weight(net, dataset, mode='train', alpha=0.1, compute_autograd=True):
  '''
  Compute task-specific (inner) weights on the support set
  param net: meta model weight
  param dataset: task
  param alpha: task-specific learning rate
  '''
  loss_grad = inner_gradient(net, dataset, mode=mode, compute_autograd=compute_autograd)
  task_specific_weight = net.weight - alpha * torch.tensor(loss_grad)
  return task_specific_weight

In [None]:
w1 = inner_weight(net, D1)
print(f"task sepcific weight (w'1) of D1: {w1}")

w2 = inner_weight(net, D2)
print(f"task sepcific weight (w'2) of D2: {w2}")

First order loss grad: tensor([102.])
task sepcific weight (w'1) of D1: tensor([-9.2000], grad_fn=<SubBackward0>)
First order loss grad: tensor([194.])
task sepcific weight (w'2) of D2: tensor([-18.4000], grad_fn=<SubBackward0>)


  task_specific_weight = net.weight - alpha * torch.tensor(loss_grad)


The meta gradient is calculated based on summation of gradient of loss over query set of each task: 

$\sum_{j} \mathcal{L}_{Q_j}(f_{\varphi_j(\theta^{(0)})})$

which can be calculated using chain rule:

$\frac{\partial \mathcal{L}_{Q_j}(f_{\varphi_j(\theta^{(0)})})}{\partial \theta^{(0)}} = \underbrace{\frac{\partial \varphi_j(\theta^{(0)})}{\partial \theta^{(0)}}}_{\circ}\underbrace{\frac{\partial \mathcal{L}_{Q_j}(f_{\varphi_j(\theta^{(0)})})}{\partial \varphi_j(\theta^{(0)})}}_{\star}$

$\frac{\partial \mathcal{L}_{Q_j}(f_{\varphi_j(\theta^{(0)})})}{\partial \theta^{(0)}} = \underbrace{\left(1-2\alpha\sum_{(x_q^j, y_q^j)\in Q_j}{\left( x_q^j\right)^2}\right)}_{eq. 14}\underbrace{\left(-2\sum_{(x_q^j, y_q^j)\in Q_j}{x_q^j(y_q^j-{\varphi_j(\theta^{(0)})}x_q^j)}\right)}_{eq. 13}$

In [None]:
def meta_gradient(net, dataset, weight, mode='test', alpha=0.1, compute_autograd=False):
  '''
  Manual calculation of gradient (NO COMPUTE_AUTOGRAD PARAM)
  eqn 10
  '''
  if mode == 'train':
    data = dataset['support']
  if mode == 'test':
    data = dataset['query']
  # eqn 8
  """if compute_autograd == True:
    #use_autograd = True
    eq8 = inner_weight(net, dataset, alpha=0.1, compute_autograd = True )"""

  # use inner_gradient to handle everything.
  loss_grad = inner_gradient(net, dataset, mode=mode, weight=weight,compute_autograd=compute_autograd)
    
  if compute_autograd == True: #TC
    #pdb.set_trace()
    loss_grad = inner_gradient(net, dataset, mode=mode, weight=weight,compute_autograd=True)
    print(f"{loss_grad = }")

    eq8 = torch.tensor(loss_grad)
    eq9 = torch.tensor([1]) - 2 * alpha * torch.tensor(data[:, 0]**2)

  else: #TC
    eq8 = inner_gradient(net, dataset, mode=mode, weight=weight)
    print(f"Eq: 13: {eq8:}")
    # eqn 9
    eq9 = torch.tensor([1]) - 2 * alpha * torch.sum(data[:, 0]**2)
    print(f"Eq: 14: {eq9:}")

  new_meta_gradient = eq8 * eq9

  return new_meta_gradient

So the meta gradients for task 1 $D_1$ and task 2 $D_2$ can be calculated as follows:

In [None]:
print(f"Meta gradient for D1 (query set): {meta_gradient(net, D1, w1)}")
print(f"Meta gradient for D2 (query set): {meta_gradient(net, D2, w2)}")

task-specific weight: tensor([0.6000], grad_fn=<SubBackward0>)
Eq: 13: -2.799999952316284
Eq: 14: tensor([0.8000])
Meta gradient for D1 (query set): tensor([-2.2400])
task-specific weight: tensor([-8.2000], grad_fn=<SubBackward0>)
Eq: 13: -270.3999938964844
Eq: 14: tensor([-2.2000])
Meta gradient for D2 (query set): tensor([594.8800])


The meta weight updates

$\theta^{(1)} =\tilde{\theta}^{(0)}-\beta \frac{\partial}{\partial \theta^{(0)}} \sum_{j} \mathcal{L}_{Q_j}(f_{\varphi_j(\theta^{(0)})})\Bigg|_{\theta^{(0)}=\tilde{\theta}^{(0)}}$

$= \tilde{\theta}^{(0)}-\beta \sum_{j} \underbrace{\frac{\partial \mathcal{L}_{Q_j}(f_{\varphi_j(\theta^{(0)})})}{\partial \theta^{(0)}}}_{*}\Bigg|_{\theta^{(0)}=\tilde{\theta}^{(0)}}$

where $\beta=0.5$ is meta learning weight, and $\tilde{\theta}^{(0)}=1$

In [None]:
net = SingleNet()

In [None]:
def meta_weight(net, datasets, weight, beta=0.5, compute_autograd=True): #TC: only change compute_autograd variable here
  '''
  Compute meta (outer) weights on the query set
  param weight: weight vector of task specific weights
  '''
  # eqn 6
  # update weights
  # all_query_sets = list(map(lambda x: x['query'], datasets))
  # print(f'{all_query_sets=}')
  if compute_autograd:
    w1 = inner_weight(net, datasets[0], mode='train')
    # w2 = inner_weight(net, datasets[1], mode='train')
    print(f'{w1=}')
    # loss_1 = loss(w1, datasets[0], mode='test')
    loss_1 = loss(w1, datasets[0], mode='test')
    # loss_2 = loss(w2, datasets[1], mode='test')

    print(f'{loss_1=}')
    loss_1.backward()
    

    print(net.weight.grad)
  else:
    # MANUAL CALCULATION USING META GRADIENT
    all_meta_gradient = torch.tensor(list((map(lambda i: meta_gradient(net, datasets[i], weight=weight[i], mode='test'), list(range(len(datasets)))))))
    print(f"Gradients of loss over query sets: {all_meta_gradient}")
    net.weight = net.weight - (beta * torch.sum(all_meta_gradient))
    print(f"Updated meta model weight after one gradient step: {net.weight}")

In [None]:
meta_weight(net, D_all, [w1, w2])

First order loss grad: tensor([4.])
w1=tensor([0.6000], grad_fn=<SubBackward0>)
First order loss grad: tensor([8.])
loss_1=tensor(3.2400, grad_fn=<SumBackward0>)
tensor([4.4000])


  task_specific_weight = net.weight - alpha * torch.tensor(loss_grad)


In [None]:
meta_weight(net, D_all, [w1, w2])

task-specific weight: tensor([0.6000], grad_fn=<SubBackward0>)
Eq: 13: -2.799999952316284
Eq: 14: tensor([0.8000])
task-specific weight: tensor([-8.2000], grad_fn=<SubBackward0>)
Eq: 13: -270.3999938964844
Eq: 14: tensor([-2.2000])
Gradients of loss over query sets: tensor([ -2.2400, 594.8800])
Updated meta model weight after one gradient step: tensor([-295.3200], grad_fn=<SubBackward0>)


## Autograd: Single Neuron Neural Netwrok model weight update using autograd

The followings show the MAML model using autograd (automatic grdaient calculation)

In [None]:
# we define a new net model 
net2 = SingleNet()
net2.weight

tensor([1.], requires_grad=True)

Compute task-specific weight using autograd

* Compute the loss using `loss()` function $$\mathcal{L}_{S_j}(f_\theta(x),y)=\sum_{(x_i^j,y_i^j) \in S_j}{(y_i^j-\theta x_i^j)^2}$$
* Compute gradient of loss using autograd: $$\frac{\partial \mathcal{L}_{S_j}(f_\theta)}{\partial \theta}=-2\sum_{(x_i^j,y_i^j)\in S_j}{x_i^j(y_i^j-\theta x_i^j)}$$
* Calculate the task-specific weights ($\varphi_j$)

Initial network weight $\theta^{(0)}=1$

In [None]:
print(f"Gradient of loss for D1 (autograd): {inner_gradient(net2, D1,compute_autograd = True)}")
print(f"Gradient of loss for D2 (autograd): {inner_gradient(net2, D2,compute_autograd= True)}")

First order loss grad: (tensor([4.], grad_fn=<SumBackward1>),)
Gradient of loss for D1 (autograd): (tensor([4.], grad_fn=<SumBackward1>),)
First order loss grad: (tensor([92.], grad_fn=<SumBackward1>),)
Gradient of loss for D2 (autograd): (tensor([92.], grad_fn=<SumBackward1>),)


In [None]:
#ww=net2.weight.detach()
w1=inner_weight(net2, D1, alpha=0.1, compute_autograd = True)
print(f"{w1 =}")

w2=inner_weight(net2, D2, alpha=0.1, compute_autograd = True)
print(f"{w2 =}")

First order loss grad: (tensor([4.], grad_fn=<SumBackward1>),)
w1 =tensor([0.6000], grad_fn=<SubBackward0>)
First order loss grad: (tensor([92.], grad_fn=<SumBackward1>),)
w2 =tensor([-8.2000], grad_fn=<SubBackward0>)


In [None]:
tw1 = torch.tensor(w1, requires_grad = True)
tw2 = torch.tensor(w2, requires_grad = True)

  tw1= torch.tensor(w1, requires_grad = True)
  tw2 =torch.tensor(w2, requires_grad = True)


In [None]:
print(f"Meta gradient for D1 (query set) using autograd: {meta_gradient(net2, D1, tw1, compute_autograd = True)}")
print(f"Meta gradient for D2 (query set) using autograd: {meta_gradient(net2, D2, tw2, compute_autograd = True)}")

task-specific weight: tensor([0.6000], grad_fn=<SubBackward0>)
First order loss grad: (tensor([-2.], grad_fn=<MulBackward0>),)
loss_grad = (tensor([-2.], grad_fn=<MulBackward0>),)
Meta gradient for D1 (query set) using autograd: tensor([-1.6000])
task-specific weight: tensor([-8.2000], grad_fn=<SubBackward0>)
First order loss grad: (tensor([24.], grad_fn=<MulBackward0>),)
loss_grad = (tensor([24.], grad_fn=<MulBackward0>),)
Meta gradient for D2 (query set) using autograd: tensor([-52.8000])


  eq9 = torch.tensor([1]) - 2 * alpha * torch.tensor(data[:, 0]**2)


In [None]:
net2(4)

x = D2['support'][:,0]
y = D2['support'][:,1]
new_loss = torch.sum((y - net2(x))**2)

# instead of using loss.backward(), use torch.autograd.grad() to compute gradients
loss_grads = grad(new_loss, net2.weight, create_graph=True)
loss_grads

(tensor([92.], grad_fn=<SumBackward1>),)

In [None]:
d2loss =[]

drv = grad(loss_grads, net2.weight, create_graph=True)
d2loss.append(drv)
print(net2.weight, drv)

tensor([1.], requires_grad=True) (tensor([32.], grad_fn=<MulBackward0>),)


In [None]:
#ww=net2.weight.detach()
w1=inner_weight2(net2, D1, alpha=0.1, compute_autograd = True)
print(f"{w1 =}")

w2=inner_weight2(net2, D2, alpha=0.1, compute_autograd = True)
print(f"{w2 =}")

loss_grads=(tensor([4.], grad_fn=<SumBackward1>),)
w1 =tensor([0.6000], grad_fn=<SubBackward0>)
loss_grads=(tensor([92.], grad_fn=<SumBackward1>),)
w2 =tensor([-8.2000], grad_fn=<SubBackward0>)


In [None]:
weight=net2.weight
grad_s1= compute_first_order_autograd(weight, losses_s[0])
weight=net2.weight   #  question: how to reuse the initialization weight in pytorch
grad_s2= compute_first_order_autograd(weight, losses_s[1])

weight.grad=tensor([4.])
weight.grad=tensor([96.])


In [None]:
alpha = 0.1
task_specific_weight1 = net2.weight - alpha * grad_s1
task_specific_weight2 = net2.weight - alpha * grad_s2

In [None]:
print(f"{task_specific_weight1 = }")
print(f"{task_specific_weight2 = }")
net2.weight

task_specific_weight1 = tensor([-8.6000], grad_fn=<SubBackward0>)
task_specific_weight2 = tensor([-8.6000], grad_fn=<SubBackward0>)


tensor([1.], requires_grad=True)

In [None]:
net2(D1['support'][:,0])
grad_s1

tensor([96.])

In [None]:
print(f"{net2.weight=}")
print(f"{net2.weight.grad}")
rloss = loss(net2.weight,D1)
print(f"{rloss=}")
rloss.backward()
print(f"{net2.weight.grad=}")

net2.weight=tensor([1.], requires_grad=True)
None
rloss=tensor(8., grad_fn=<SumBackward0>)
net2.weight.grad=tensor([4.])


In [None]:
https://github.com/GauravIyer/MAML-Pytorch/blob/master/Experiment%201/Experiment_1_Sine_Regression.ipynb