# MAML - MODEL-AGNOSTIC META-LEARNING

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
%cd drive/MyDrive/'Colab Notebooks/MetaLearning'
!ls

/content/drive/MyDrive/Colab Notebooks/MetaLearning
l2lutils.ipynb	models.ipynb  nb1.ipynb  nb2-CNP.ipynb	nb3.ipynb  utils.ipynb


In [4]:
!pip install import_ipynb --quiet
!pip install learn2learn --quiet

In [5]:
import import_ipynb
import utils
import models
# import l2lutils
utils.hide_toggle('Imports 1')

importing Jupyter notebook from utils.ipynb
importing Jupyter notebook from models.ipynb


In [6]:
from IPython import display
import torch
import torch.nn as nn
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt
# from l2lutils import KShotLoader
from IPython import display
utils.hide_toggle('Imports 2')

l2lutils

In [7]:
import torch
import numpy as np
import learn2learn as l2l
from learn2learn.data import *
import import_ipynb
import utils
class KShotLoader():
    """Sample k-shot, n-way tasks from a labelled dataset using learn2learn.

    Every sampled task draws ``2*shots`` examples for each of ``ways`` classes;
    ``get_task`` then splits them into an adaptation (train) part and an
    evaluation (test) part.

    Args:
        myds: dataset exposing a ``labels`` attribute; it is wrapped in
            ``l2l.data.MetaDataset``.
        num_tasks: number of tasks passed to ``l2l.data.TaskDataset``.
        shots: number of examples per class in the adaptation part of a task.
        ways: number of classes per task.
        classes: labels that tasks may be drawn from. ``None`` (default) means
            every distinct label present in ``myds.labels``.
    """
    def __init__(self,myds,num_tasks=1000,shots=2,ways=2,classes=None):
        self.shots = shots
        self.ways = ways
        self.myMds = l2l.data.MetaDataset(myds)
        if classes is None:
            classes = self._infer_classes(myds)
        self.my_tasks = l2l.data.TaskDataset(self.myMds, task_transforms=[
                                l2l.data.transforms.FilterLabels(self.myMds,classes),
                                l2l.data.transforms.NWays(self.myMds,ways),
                                # 2*shots per class: half for adaptation, half for evaluation.
                                l2l.data.transforms.KShots(self.myMds,2*shots),
                                l2l.data.transforms.LoadData(self.myMds),
                                l2l.data.transforms.RemapLabels(self.myMds),
                                l2l.data.transforms.ConsecutiveLabels(self.myMds)
                                ],num_tasks=num_tasks)

    @staticmethod
    def _infer_classes(myds):
        """Return the sorted distinct labels of ``myds`` as plain Python ints.

        Labels are converted with ``int()`` before de-duplication because tensor
        elements hash by identity, so a ``set`` of them would not collapse
        repeated label values.
        """
        return sorted({int(label) for label in myds.labels})

    def get_task(self):
        """Sample one task and split it into adaptation and evaluation sets.

        Rows at the even positions ``0, 2, ..., 2*(shots*ways - 1)`` of the
        sampled batch form the adaptation set; all remaining rows form the
        evaluation set.

        Returns:
            ``(d_train, d_test)``, each a ``(data, labels)`` tuple.
        """
        data,labels = self.my_tasks.sample()
        n_adapt = self.shots*self.ways
        adaptation_mask = torch.zeros(data.size(0), dtype=torch.bool)
        adaptation_mask[torch.arange(n_adapt) * 2] = True
        evaluation_mask = ~adaptation_mask
        d_train = (data[adaptation_mask], labels[adaptation_mask])
        d_test = (data[evaluation_mask], labels[evaluation_mask])
        return d_train, d_test

# Pre-trained Models

In [8]:
#Generate data - euclidean
meta_train_ds, meta_test_ds, full_loader = utils.euclideanDataset(n_samples=10000,n_features=20,n_classes=10,batch_size=32)

In [9]:
# Define an MLP network. Note that input dimension has to be data dimension. For classification
# final dimension has to be number of classes; for regression one.
#torch.manual_seed(10)
net = models.MLP(dims=[20,32,32,10])

In [10]:
# Train the network; note that network is trained in place so repeated calls further train it.
net,loss,accs=models.Train(net,full_loader,lr=1e-2,epochs=5,verbose=True)

Epoch   4 Loss: 1.37655e-01 Accuracy: 0.96995


In [11]:
#Training accuracy.
models.accuracy(net,meta_train_ds.samples,meta_train_ds.labels,verbose=True)

7287.0 7500


0.9716

In [12]:
# Test accuracy.
models.accuracy(net,meta_test_ds.samples,meta_test_ds.labels)

2394.0 2500


0.9576

# Second-order Differentiation using Autograd

Second-order derivatives as needed for MAML

In [13]:
# Toy linear model with an explicit weight argument: the prediction is the matrix product x @ w.
network = (lambda x,w: x@w)
# Mean-squared-error criterion, used for both losses (L and L1) computed in the cells below.
loss = torch.nn.MSELoss()

In [14]:
Z=(torch.ones(3,1)).float()
z=(torch.ones(3,1)*2).float()

In [15]:
Zt=(torch.ones(3,1)*1.5).float()
zt=(torch.ones(3,1)*2*1.5).float()

In [16]:
w0=(torch.ones(1,1,requires_grad=True)).float()

In [17]:
w1=w0.clone()

In [18]:
L=loss(network(Z,w1),z)

In [19]:
#g=torch.autograd.grad(L,w0)[0]
g=torch.autograd.grad(L,w1,create_graph=True)[0]
#L.backward(create_graph=True)# Not good

In [20]:
w1.grad, w0.grad, L, w0, w1,w1.requires_grad,g

  """Entry point for launching an IPython kernel.


(None,
 None,
 tensor(1., grad_fn=<MseLossBackward>),
 tensor([[1.]], requires_grad=True),
 tensor([[1.]], grad_fn=<CloneBackward>),
 True,
 tensor([[-2.]], grad_fn=<TBackward>))

In [21]:
w1 = w1 - 0.1*g

In [22]:
L1=loss(network(Zt,w1),zt)
#L1=loss(net(Zt,w0-0.1*(2.0*(w0-2.0))),zt)

In [23]:
# Both OK - latter used with optimizer.step()
g1=torch.autograd.grad(L1,w0)[0]
#L1.backward()

In [24]:
g1

tensor([[-2.8800]])

Working this out manually:

$w_0=1, L=(w_0-2)^2, dL=2\times(w_0-2)=-2,w_1=w_0-0.1\times(-2)=1.2$

$L_1=(w_1\times1.5-3)^2 = \big((w_0-0.1\times 2\times(w_0-2))\times1.5-3\big)^2 = (-1.2)^2$

$dL_1 = 2 \times (-1.2) \times (1.5 \times (1-0.2))$

In [25]:
2*(-1.2)*(1.5*(1-.2))

-2.8800000000000003

In [26]:
w0.grad,w1.grad

  """Entry point for launching an IPython kernel.


(None, None)

# Meta-Learning: Tasks

Generate a k-shot n-way loader using the meta-training dataset

In [27]:
classes_train = [i for i in range(5)]
classes_test = [i+5 for i in range(5)]
classes_train, classes_test

([0, 1, 2, 3, 4], [5, 6, 7, 8, 9])

In [28]:
meta_train_kloader=KShotLoader(meta_train_ds,shots=2,ways=5)

Sample a task - each task has a k-shot n-way training set and a similar test set

In [29]:
d_train,d_test=meta_train_kloader.get_task()

In [30]:
d_test[1]

tensor([0, 0, 4, 4, 2, 2, 3, 3, 1, 1])

Let's try learning directly from the task training set despite its small size: create a dataset and loader from it, and train the earlier network on it with the Train function.

In [31]:
taskds = utils.MyDS(d_train[0],d_train[1])

In [32]:
d_train_loader = torch.utils.data.DataLoader(dataset=taskds,batch_size=1,shuffle=True)

In [33]:
net,losses,accs=models.Train(net,d_train_loader,lr=1e-1,epochs=10,verbose=True)

Epoch   9 Loss: 1.13965e-02 Accuracy: 1.00000


How does it do on the test set of the sampled task?

In [34]:
models.accuracy(net,d_test[0],d_test[1])

8.0 10


0.8

# MAML - Model-Agnostic Meta-Learning

In [35]:
import learn2learn as l2l
import torch.optim as optim

In [36]:
# Wrap the network for MAML; lr is the step size applied by learner.adapt() (the inner loop).
maml = l2l.algorithms.MAML(net, lr=1e-1)
# Outer-loop (meta) optimizer; it updates the parameters of the original network.
optimizer = optim.Adam(net.parameters(),lr=1e-3)
# Negative log-likelihood loss; presumably models.MLP outputs log-probabilities — TODO confirm.
lossfn = torch.nn.NLLLoss()

The MAML class above wraps our nn.Module so that its parameters can be cloned and adapted, as shown below. One iteration of the MAML algorithm begins by sampling a training task. Note that each of d_train and d_test is a tuple comprising a set of samples and the corresponding labels.

In [37]:
d_train,d_test=meta_train_kloader.get_task()

In [38]:
learner = maml.clone()

The learner class above is a 'clone' of our network with copies of parameters so that we can change these without changing the parameters of the network. We apply the learner on training data of d_train and compute TRAINING loss w.r.t the training data of the task, i.e., d_train.

In [39]:
train_preds = learner(d_train[0])
train_loss = lossfn(train_preds,d_train[1])

In [40]:
net.layers[0].weight

Parameter containing:
tensor([[-0.0170,  0.2050, -0.1767, -0.0742,  0.0689, -0.4018, -0.1227, -0.0731,
          0.2030, -0.0670, -0.0181,  0.0241, -0.3519, -0.3282,  0.1709,  0.0905,
          0.2800,  0.1334,  0.2410, -0.3506],
        [-0.0458, -0.1684,  0.1120,  0.2084, -0.2014,  0.1386,  0.1588,  0.0160,
          0.3254, -0.1710,  0.0373,  0.2474,  0.1794,  0.1083, -0.0437,  0.2532,
          0.3755, -0.0917,  0.0109, -0.0544],
        [ 0.1411, -0.1818, -0.1080, -0.0474,  0.2119,  0.0112, -0.0047, -0.2275,
         -0.1094,  0.1886,  0.0798, -0.0432,  0.1764, -0.1920,  0.3518, -0.1561,
         -0.3709,  0.0619,  0.0253,  0.0599],
        [ 0.1448, -0.2944,  0.2867, -0.1107, -0.1414, -0.0796, -0.1252, -0.1100,
         -0.1379, -0.1652,  0.1786,  0.0779,  0.1500, -0.2978, -0.0799,  0.1574,
          0.2165, -0.0510, -0.1667,  0.1264],
        [ 0.3457, -0.3076,  0.0544,  0.0358,  0.0608,  0.1720, -0.0316, -0.1716,
         -0.0502, -0.1075, -0.0986,  0.1823, -0.1532,  0.1865, -0

In [41]:
learner.layers[0].weight

tensor([[-0.0170,  0.2050, -0.1767, -0.0742,  0.0689, -0.4018, -0.1227, -0.0731,
          0.2030, -0.0670, -0.0181,  0.0241, -0.3519, -0.3282,  0.1709,  0.0905,
          0.2800,  0.1334,  0.2410, -0.3506],
        [-0.0458, -0.1684,  0.1120,  0.2084, -0.2014,  0.1386,  0.1588,  0.0160,
          0.3254, -0.1710,  0.0373,  0.2474,  0.1794,  0.1083, -0.0437,  0.2532,
          0.3755, -0.0917,  0.0109, -0.0544],
        [ 0.1411, -0.1818, -0.1080, -0.0474,  0.2119,  0.0112, -0.0047, -0.2275,
         -0.1094,  0.1886,  0.0798, -0.0432,  0.1764, -0.1920,  0.3518, -0.1561,
         -0.3709,  0.0619,  0.0253,  0.0599],
        [ 0.1448, -0.2944,  0.2867, -0.1107, -0.1414, -0.0796, -0.1252, -0.1100,
         -0.1379, -0.1652,  0.1786,  0.0779,  0.1500, -0.2978, -0.0799,  0.1574,
          0.2165, -0.0510, -0.1667,  0.1264],
        [ 0.3457, -0.3076,  0.0544,  0.0358,  0.0608,  0.1720, -0.0316, -0.1716,
         -0.0502, -0.1075, -0.0986,  0.1823, -0.1532,  0.1865, -0.0009,  0.1407,
      

Note that at this point both the learner and the original net have the same parameters. Let's see what the gradients w.r.t. the TRAINING loss are (we use PyTorch's autograd functions directly):

In [42]:
from torch.autograd import grad

In [43]:
train_grad=grad(train_loss,learner.layers[0].weight,retain_graph=True,
                                 create_graph=True,
                                 allow_unused=True)
train_grad[0]

tensor([[-1.5369e-01,  7.6493e-03,  5.1484e-02, -4.8598e-03, -2.5352e-01,
         -1.6761e-01,  1.7654e-02,  4.2795e-02,  1.6681e-01,  6.7950e-02,
         -6.7698e-02,  5.0039e-02, -6.7161e-02,  1.2873e-01, -3.8113e-02,
          1.0247e-02,  7.2580e-03, -1.1101e-01,  1.0865e-02,  2.7140e-02],
        [-2.5535e-02, -3.3880e-02, -3.9947e-03,  2.8470e-02, -4.4797e-02,
         -9.4264e-02,  6.8652e-03, -8.0587e-02,  5.1062e-02,  1.5032e-02,
          7.1067e-02, -4.2555e-03,  1.9587e-02,  2.0244e-02, -1.1713e-02,
          3.8650e-02,  9.0536e-03, -4.4504e-02,  1.3211e-02, -4.8954e-03],
        [-3.0367e-02, -3.7475e-01, -2.0228e-01, -4.7997e-01,  2.2323e-01,
         -2.3587e-01,  3.0371e-01, -2.5351e-01, -3.3591e-02, -4.4211e-02,
          5.5854e-03,  5.6466e-02,  1.8287e-01, -3.7727e-02, -1.7420e-01,
          1.0689e-02,  5.1989e-02,  3.5636e-01, -3.1156e-02,  6.3373e-02],
        [ 2.1314e-01, -7.2168e-03, -1.1497e-01,  6.7505e-02,  2.5608e-01,
          2.1217e-01, -8.3638e-02, 

Next we ADAPT the learner by taking one gradient-descent step on the CLONED parameters, i.e., moving them against the gradient of the TRAINING loss above. This is the part that the l2l library does for us as per the MAML algorithm.

In [44]:
learner.adapt(train_loss)

We can check what has happened:

In [45]:
learner.layers[0].weight

tensor([[-1.6563e-03,  2.0427e-01, -1.8190e-01, -7.3707e-02,  9.4210e-02,
         -3.8500e-01, -1.2447e-01, -7.7377e-02,  1.8631e-01, -7.3786e-02,
         -1.1373e-02,  1.9119e-02, -3.4514e-01, -3.4105e-01,  1.7467e-01,
          8.9478e-02,  2.7923e-01,  1.4451e-01,  2.3991e-01, -3.5328e-01],
        [-4.3255e-02, -1.6501e-01,  1.1236e-01,  2.0553e-01, -1.9695e-01,
          1.4800e-01,  1.5808e-01,  2.4056e-02,  3.2032e-01, -1.7247e-01,
          3.0225e-02,  2.4786e-01,  1.7745e-01,  1.0623e-01, -4.2480e-02,
          2.4931e-01,  3.7455e-01, -8.7212e-02,  9.5304e-03, -5.3922e-02],
        [ 1.4414e-01, -1.4435e-01, -8.7815e-02,  6.0119e-04,  1.8960e-01,
          3.4824e-02, -3.5071e-02, -2.0214e-01, -1.0607e-01,  1.9299e-01,
          7.9197e-02, -4.8808e-02,  1.5816e-01, -1.8823e-01,  3.6926e-01,
         -1.5719e-01, -3.7613e-01,  2.6285e-02,  2.8384e-02,  5.3519e-02],
        [ 1.2346e-01, -2.9373e-01,  2.9821e-01, -1.1749e-01, -1.6697e-01,
         -1.0081e-01, -1.1680e-01, 

In [46]:
(net.layers[0].weight - learner.layers[0].weight)/train_grad[0]

tensor([[0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000, 0.1000],
        [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000, 0.1000],
        [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000, 0.1000],
        [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000, 0.1000],
        [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000,
         0.1000, 0.1000],
        [0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1000, 0.1

So one gradient-descent step (w.r.t. train_loss) with step size 0.1 has been taken. Next we compute the loss of this ADAPTED learner w.r.t. the TEST data of the task, i.e., d_test:

In [47]:
test_preds = learner(d_test[0])
adapt_loss = lossfn(test_preds,d_test[1])

The main MAML update to the original network net takes place now, by back-propagating through the (cumulative) adaptation loss (across possibly many tasks, here there was just one):

In [48]:
task_count = 1
optimizer.zero_grad()
total_loss = adapt_loss/task_count
total_loss.backward()

In [49]:
net.layers[0].weight

Parameter containing:
tensor([[-0.0170,  0.2050, -0.1767, -0.0742,  0.0689, -0.4018, -0.1227, -0.0731,
          0.2030, -0.0670, -0.0181,  0.0241, -0.3519, -0.3282,  0.1709,  0.0905,
          0.2800,  0.1334,  0.2410, -0.3506],
        [-0.0458, -0.1684,  0.1120,  0.2084, -0.2014,  0.1386,  0.1588,  0.0160,
          0.3254, -0.1710,  0.0373,  0.2474,  0.1794,  0.1083, -0.0437,  0.2532,
          0.3755, -0.0917,  0.0109, -0.0544],
        [ 0.1411, -0.1818, -0.1080, -0.0474,  0.2119,  0.0112, -0.0047, -0.2275,
         -0.1094,  0.1886,  0.0798, -0.0432,  0.1764, -0.1920,  0.3518, -0.1561,
         -0.3709,  0.0619,  0.0253,  0.0599],
        [ 0.1448, -0.2944,  0.2867, -0.1107, -0.1414, -0.0796, -0.1252, -0.1100,
         -0.1379, -0.1652,  0.1786,  0.0779,  0.1500, -0.2978, -0.0799,  0.1574,
          0.2165, -0.0510, -0.1667,  0.1264],
        [ 0.3457, -0.3076,  0.0544,  0.0358,  0.0608,  0.1720, -0.0316, -0.1716,
         -0.0502, -0.1075, -0.0986,  0.1823, -0.1532,  0.1865, -0

In [50]:
optimizer.step()

In [51]:
net.layers[0].weight

Parameter containing:
tensor([[-1.6025e-02,  2.0403e-01, -1.7575e-01, -7.3193e-02,  6.9858e-02,
         -4.0276e-01, -1.2170e-01, -7.4097e-02,  2.0399e-01, -6.7991e-02,
         -1.7143e-02,  2.3123e-02, -3.5086e-01, -3.2717e-01,  1.6986e-01,
          9.1503e-02,  2.8096e-01,  1.3241e-01,  2.3999e-01, -3.4956e-01],
        [-4.6808e-02, -1.6940e-01,  1.1296e-01,  2.0938e-01, -2.0242e-01,
          1.3758e-01,  1.5976e-01,  1.4998e-02,  3.2643e-01, -1.6996e-01,
          3.8332e-02,  2.4843e-01,  1.8040e-01,  1.0725e-01, -4.2651e-02,
          2.5418e-01,  3.7645e-01, -9.2662e-02,  1.1851e-02, -5.5411e-02],
        [ 1.4210e-01, -1.8082e-01, -1.0904e-01, -4.6396e-02,  2.1092e-01,
          1.2237e-02, -5.7000e-03, -2.2649e-01, -1.1043e-01,  1.8757e-01,
          8.0756e-02, -4.4161e-02,  1.7744e-01, -1.9300e-01,  3.5284e-01,
         -1.5712e-01, -3.7193e-01,  6.0922e-02,  2.6269e-02,  5.8857e-02],
        [ 1.4378e-01, -2.9545e-01,  2.8771e-01, -1.0974e-01, -1.4036e-01,
         -8.0

So, the original parameters have been updated by one optimizer step based on the adaptation losses of all the tasks (here, just one).

# Putting it all together: MAML Algorithm
Now let's put all of the above in a loop - the MAML algorithm:

In [52]:
import learn2learn as l2l
import torch.optim as optim
# Label splits kept for reference only: they are not passed to the loader below,
# so tasks are drawn from every class present in meta_train_ds.
classes_train = list(range(5))
classes_test = list(range(3, 8))
classes_train, classes_test
# Task geometry: `shots` examples per class, `ways` classes per task.
shots, ways = 5, 2
# Fresh MLP whose output width equals the number of classes in a task.
net = models.MLP(dims=[20, 64, 32, ways])
#net = models.RNN(n_classes=3,dim=10,n_layers=2)
# Inner-loop step size is set on the MAML wrapper, the meta step size on Adam.
maml = l2l.algorithms.MAML(net, lr=1e-2)
optimizer = optim.Adam(maml.parameters(), lr=5e-3)
lossfn = torch.nn.NLLLoss()
meta_train_kloader = KShotLoader(
    meta_train_ds,
    shots=shots,
    ways=ways,
    num_tasks=1000,
)

In [53]:
# Number of epochs, tasks per step and number of fast_adaptation steps 
n_epochs=50
task_count=32
fas = 5

Note: In practice we use more than one gradient step for adaptation; this is called 'fast adaptation'.

In [54]:
# Meta-training. Each epoch adapts a fresh clone of the meta-model on `task_count`
# sampled tasks, then takes ONE optimizer step on the mean post-adaptation loss.
for epoch in range(n_epochs):
    adapt_loss = 0.0
    test_acc = 0.0
    # Sample and train on a task
    for task in range(task_count):
        d_train,d_test=meta_train_kloader.get_task()
        # The clone shares the graph with maml's parameters, so gradients of the
        # adapted clone's loss flow back to the meta-parameters.
        learner = maml.clone()
        # Inner loop ("fast adaptation"): `fas` gradient steps on the task's training set.
        for fas_step in range(fas):
            train_preds = learner(d_train[0])
            train_loss = lossfn(train_preds,d_train[1])
            learner.adapt(train_loss)
        # Loss of the adapted clone on the task's test set, accumulated across tasks.
        test_preds = learner(d_test[0])
        adapt_loss += lossfn(test_preds,d_test[1])
        learner.eval()
        test_acc += models.accuracy(learner,d_test[0],d_test[1],verbose=False)
        learner.train()
        # Done with a task
    # Average over tasks so that the optimised loss matches the reported one and the
    # gradient scale does not depend on task_count.
    total_loss = adapt_loss/task_count
    print('Epoch  % 2d Loss: %2.5e Avg Acc: %2.5f'%(epoch,total_loss.item(),test_acc/task_count))
    # wait=True defers the clearing until the next output arrives, so only the
    # latest progress line stays visible.
    display.clear_output(wait=True)
    # Update main network
    optimizer.zero_grad()
    total_loss.backward()
    optimizer.step()
    

Epoch   49 Loss: 9.39257e-02 Avg Acc: 0.97187


In [57]:
# for i in range(1,5):
#   print('Epoch  % d'%(i))
#   display.clear_output(wait=True)

Epoch   4


Now test the trained MAML network by applying the adaptation steps to tasks sampled from the meta_test_ds dataset:

In [55]:
# Meta-test: adapt a clone of the trained meta-model on each sampled task and
# report the mean accuracy on the tasks' test sets.
meta_test_kloader=KShotLoader(meta_test_ds,shots=shots,ways=ways)
test_acc = 0.0
task_count = 20
adapt_steps = 5
maml.eval()
# Sample and train on a task
for task in range(task_count):
    d_train,d_test=meta_test_kloader.get_task()
    # No meta-gradient is taken here, so a first-order clone avoids building the
    # second-order graph during adaptation.
    learner = maml.clone(first_order=True)
    learner.eval()
    for adapt_step in range(adapt_steps):
        train_preds = learner(d_train[0])
        train_loss = lossfn(train_preds,d_train[1])
        learner.adapt(train_loss)
    test_acc += models.accuracy(learner,d_test[0],d_test[1],verbose=False)
    # Done with a task
# Undo the maml.eval() above so that later meta-training starts in train mode.
maml.train()
print('Avg Acc: %2.5f'%(test_acc/task_count))

Avg Acc: 0.96500
