<a href="https://colab.research.google.com/github/sarjakpatel/CMPE_297_Advanced_Deep_Learning/blob/main/Assignment%204/CMPE_297_Assignment_4_Part_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CMPE 297 Assignment 4 Part 3: Implementing meta-learning on top of BERT in Colab

## Importing Libraries

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 29.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 71.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 63.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [2]:
import json
import requests
import gc
import collections
import random
import os
import torch
import numpy as np
import pickle
import time
import logging
from random import shuffle
from collections import Counter
from torch import nn
from torch.utils.data import Dataset
from torch.nn import functional as F
from transformers import BertModel, BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from torch.optim import Adam
from torch.nn import CrossEntropyLoss
from transformers import BertForSequenceClassification
from copy import deepcopy
from sklearn.metrics import accuracy_score

## Dataset

In [3]:
# Download the review dataset (a JSON list of dicts with 'text', 'label' and 'domain' keys).
# A timeout keeps the cell from hanging forever, and raise_for_status() surfaces HTTP
# errors here instead of as a confusing JSON decoding failure.
response = requests.get(
    'https://raw.githubusercontent.com/mailong25/meta-learning-bert/master/dataset.json',
    timeout=60,
)
response.raise_for_status()
reviews = response.json()

reviews[:5]

[{'text': "GOOD LOOKING KICKS IF YOUR KICKIN IT OLD SCHOOL LIKE ME. AND COMFORTABLE. AND RELATIVELY CHEAP. I'LL ALWAYS KEEP A PAIR OF STAN SMITH'S AROUND FOR WEEKENDS",
  'label': 'positive',
  'domain': 'apparel'},
 {'text': 'These sunglasses are all right. They were a little crooked, but still cool..',
  'label': 'positive',
  'domain': 'apparel'},
 {'text': "I don't see the difference between these bodysuits and the more expensive ones. Fits my boy just right",
  'label': 'positive',
  'domain': 'apparel'},
 {'text': 'Very nice basic clothing. I think the size is fine. I really like being able to find these shades of green, though I have decided the lighter shade is really a feminine color. This is the only brand that I can find these muted greens',
  'label': 'positive',
  'domain': 'apparel'},
 {'text': 'I love these socks. They fit great (my 15 month old daughter has thick ankles) and she can zoom around on the kitchen floor and not take a nose dive into things. :',
  'label': 'p

In [4]:
# Collect the domain of every review, then count how many reviews each domain has.
mention_domain = list(map(lambda review: review['domain'], reviews))
Counter(mention_domain)

Counter({'apparel': 1717,
         'baby': 1107,
         'beauty': 993,
         'books': 921,
         'camera_&_photo': 1086,
         'cell_phones_&_service': 698,
         'dvd': 893,
         'electronics': 1277,
         'grocery': 1100,
         'health_&_personal_care': 1429,
         'jewelry_&_watches': 1086,
         'kitchen_&_housewares': 1390,
         'magazines': 1133,
         'music': 1007,
         'outdoor_living': 980,
         'software': 1029,
         'sports_&_outdoors': 1336,
         'toys_&_games': 1363,
         'video': 1010,
         'automotive': 100,
         'computer_&_video_games': 100,
         'office_products': 100})

In [5]:
# Bidirectional lookup: sentiment name -> class id, and class id -> sentiment name.
_LABEL_TO_ID = {'positive': 0, 'negative': 1}
LABEL_MAP = {**_LABEL_TO_ID, **{class_id: name for name, class_id in _LABEL_TO_ID.items()}}

In [6]:
class MetaTask(Dataset):
    """
    Dataset of meta-learning tasks built from labelled review examples.

    Each item is one task: a ``(support_set, query_set)`` pair of ``TensorDataset``
    objects whose examples all come from a single, randomly chosen domain.
    """

    def __init__(self, examples, num_task, k_support, k_query, tokenizer):
        """
        :param examples: list of dicts with 'text', 'label' and 'domain' keys
        :param num_task: number of tasks to build
        :param k_support: number of support examples per task
        :param k_query: number of query examples per task
        :param tokenizer: object whose ``encode(text, max_length=..., truncation=True)``
                          returns a list of token ids
        """
        # Shuffle a private copy so the caller's list is not reordered as a side effect.
        self.examples = list(examples)
        random.shuffle(self.examples)

        self.num_task = num_task
        self.k_support = k_support
        self.k_query = k_query
        self.tokenizer = tokenizer
        self.max_seq_length = 128
        self.create_batch(self.num_task)

    def create_batch(self, num_task):
        """
        Sample ``num_task`` tasks and store them in ``self.supports`` / ``self.queries``.

        :raises ValueError: if the chosen domain has fewer than
                            ``k_support + k_query`` examples
        """
        self.supports = []  # support set
        self.queries = []  # query set
        task_size = self.k_support + self.k_query

        # Group the examples by domain once instead of re-scanning the whole
        # list for every task. Relative order inside each domain is preserved.
        examples_by_domain = {}
        for example in self.examples:
            examples_by_domain.setdefault(example['domain'], []).append(example)

        for _ in range(num_task):  # for each task
            # 1. select a domain by drawing a random example, so domains with
            #    more examples are proportionally more likely to be picked
            domain = random.choice(self.examples)['domain']
            domain_examples = examples_by_domain[domain]

            if len(domain_examples) < task_size:
                raise ValueError(
                    "Domain %r has only %d examples, but k_support + k_query = %d are needed"
                    % (domain, len(domain_examples), task_size)
                )

            # 2. select k_support + k_query examples from that domain randomly
            selected_examples = random.sample(domain_examples, task_size)
            random.shuffle(selected_examples)

            self.supports.append(selected_examples[:self.k_support])
            self.queries.append(selected_examples[self.k_support:])

    def create_feature_set(self, examples):
        """
        Tokenize ``examples`` into fixed-length tensors.

        :param examples: list of dicts with 'text' and 'label' keys
        :return: TensorDataset(input_ids, attention_mask, segment_ids, label_ids);
                 the first three have shape (len(examples), max_seq_length) and are
                 zero-padded on the right, the last has shape (len(examples),)
        """
        num_examples = len(examples)
        # Start from zeros: 0 is the padding value for ids and mask, and every
        # segment id is 0 because each input is a single sentence.
        all_input_ids      = torch.zeros(num_examples, self.max_seq_length, dtype = torch.long)
        all_attention_mask = torch.zeros(num_examples, self.max_seq_length, dtype = torch.long)
        all_segment_ids    = torch.zeros(num_examples, self.max_seq_length, dtype = torch.long)
        all_label_ids      = torch.empty(num_examples, dtype = torch.long)

        for row, example in enumerate(examples):
            # Use the tokenizer passed to the constructor (not a notebook global) and
            # let it truncate long texts so every row fits into max_seq_length.
            input_ids = self.tokenizer.encode(example['text'],
                                              max_length = self.max_seq_length,
                                              truncation = True)
            # Defensive guard in case the tokenizer ignored the truncation request.
            input_ids = input_ids[:self.max_seq_length]
            length = len(input_ids)

            all_input_ids[row, :length] = torch.tensor(input_ids, dtype = torch.long)
            all_attention_mask[row, :length] = 1
            all_label_ids[row] = LABEL_MAP[example['label']]

        tensor_set = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)
        return tensor_set

    def __getitem__(self, index):
        """Return the ``(support_set, query_set)`` TensorDatasets of task ``index``."""
        support_set = self.create_feature_set(self.supports[index])
        query_set   = self.create_feature_set(self.queries[index])
        return support_set, query_set

    def __len__(self):
        """Number of tasks held by this dataset."""
        return self.num_task

In [7]:
# Hold out the three smallest domains as unseen test domains; every other
# domain is used to build the meta-training tasks.
low_resource_domains = ["office_products", "automotive", "computer_&_video_games"]
train_examples, test_examples = [], []
for review in reviews:
    target = test_examples if review['domain'] in low_resource_domains else train_examples
    target.append(review)
print(len(train_examples), len(test_examples))

21555 300


In [8]:
# Lower-casing WordPiece tokenizer for the uncased BERT checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case = True,
)

# A task set used by the inspection cells that follow.
train = MetaTask(
    train_examples,
    num_task = 100,
    k_support = 100,
    k_query = 30,
    tokenizer = tokenizer,
)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
# Take a glance at the first two raw examples in the support set of the first meta-task
train.supports[0][:2]

[{'text': "A good way to make a cup of coffee. Particularly good if you want a fresh brewed taste, but don't need to brew a full pot for only one or two people. Can be stored in a cabinet or drawer, eliminating the use of counter space. A must have for small kitchens",
  'label': 'positive',
  'domain': 'kitchen_&_housewares'},
 {'text': 'The space between the strands is much greater than shown in the picture. The cutrain will not hide what is behind it',
  'label': 'negative',
  'domain': 'kitchen_&_housewares'}]

In [10]:
# The first meta-task: indexing returns a pair of TensorDatasets (support set, query set)
train[0]

(<torch.utils.data.dataset.TensorDataset at 0x7f0d009953d0>,
 <torch.utils.data.dataset.TensorDataset at 0x7f0cff8c9fd0>)

In [11]:
# Let's take a look at the first two tensorized samples from the support set
train[0][0][:2]

(tensor([[  101,  1037,  2204,  2126,  2000,  2191,  1037,  2452,  1997,  4157,
           1012,  3391,  2204,  2065,  2017,  2215,  1037,  4840, 24702,  2098,
           5510,  1010,  2021,  2123,  1005,  1056,  2342,  2000, 24702,  1037,
           2440,  8962,  2005,  2069,  2028,  2030,  2048,  2111,  1012,  2064,
           2022,  8250,  1999,  1037,  5239,  2030, 13065,  1010, 15349,  1996,
           2224,  1997,  4675,  2686,  1012,  1037,  2442,  2031,  2005,  2235,
          26051,   102,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,   

In [12]:
# Raise the root logger's threshold to CRITICAL so lower-severity log records are suppressed.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid that makes CUDA
# kernel launches synchronous - confirm it is still wanted, as it may slow training.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

def random_seed(value):
    """
    Seed every random number generator the notebook relies on.

    Seeds PyTorch (CPU and CUDA), NumPy and Python's `random` module with `value`
    and switches cuDNN to its deterministic mode.

    :param value: integer seed
    """
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, np.random.seed, random.seed):
        seed_fn(value)
    torch.backends.cudnn.deterministic = True

def create_batch_of_tasks(taskset, is_shuffle = True, batch_size = 4):
    """
    Lazily yield the tasks of `taskset` in lists of at most `batch_size` items.

    :param taskset: indexable collection of tasks supporting len()
    :param is_shuffle: visit the tasks in random order when True
    :param batch_size: maximum number of tasks per yielded list
    :return: generator of lists of tasks; the last list may be shorter
    """
    order = list(range(len(taskset)))
    if is_shuffle:
        random.shuffle(order)
    for start in range(0, len(order), batch_size):
        # Slicing clips at the end of the list, so the final batch is simply shorter.
        yield [taskset[position] for position in order[start:start + batch_size]]

In [13]:
class TrainingArgs:
    """Plain container for the hyper-parameters of the meta-learning experiment."""

    def __init__(self):
        defaults = dict(
            num_labels = 2,               # number of output classes
            meta_epoch = 10,              # number of outer (meta) epochs
            k_spt = 80,                   # presumably support examples per task - confirm usage
            k_qry = 20,                   # presumably query examples per task - confirm usage
            outer_batch_size = 2,         # tasks per meta-update
            inner_batch_size = 12,        # examples per inner-loop mini-batch
            outer_update_lr = 5e-5,       # learning rate of the meta optimizer
            inner_update_lr = 5e-5,       # learning rate of the per-task optimizer
            inner_update_step = 10,       # inner-loop passes during training
            inner_update_step_eval = 40,  # inner-loop passes during evaluation
            bert_model = 'bert-base-uncased',
            num_task_train = 500,         # presumably training tasks per epoch - confirm usage
            num_task_test = 5,            # presumably number of test tasks - confirm usage
        )
        for name, value in defaults.items():
            setattr(self, name, value)

args = TrainingArgs()

In [14]:
class Learner(nn.Module):
    """
    Meta Learner

    Holds the shared ("meta") model. For every task a copy of it is fine-tuned on the
    task's support set, and the difference between the meta weights and the fine-tuned
    weights is used as the gradient of a first-order meta-update (in the style of Reptile).
    """
    def __init__(self, args):
        """
        :param args: object providing num_labels, outer_batch_size, inner_batch_size,
                     outer_update_lr, inner_update_lr, inner_update_step,
                     inner_update_step_eval and bert_model
        """
        super(Learner, self).__init__()

        self.num_labels = args.num_labels
        self.outer_batch_size = args.outer_batch_size
        self.inner_batch_size = args.inner_batch_size
        self.outer_update_lr  = args.outer_update_lr
        self.inner_update_lr  = args.inner_update_lr
        self.inner_update_step = args.inner_update_step
        self.inner_update_step_eval = args.inner_update_step_eval
        self.bert_model = args.bert_model
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        self.model = BertForSequenceClassification.from_pretrained(self.bert_model, num_labels = self.num_labels)
        self.outer_optimizer = Adam(self.model.parameters(), lr=self.outer_update_lr)
        self.model.train()

    def forward(self, batch_tasks, training = True):
        """
        Adapt to every task in ``batch_tasks`` and, when training, meta-update the model.

        batch = [(support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset),
                 (support TensorDataset, query TensorDataset)]

        # support = TensorDataset(all_input_ids, all_attention_mask, all_segment_ids, all_label_ids)

        :param batch_tasks: list of (support, query) TensorDataset pairs
        :param training: when True, run ``inner_update_step`` passes per task and apply
                         the outer update; when False, run ``inner_update_step_eval``
                         passes and leave the meta model untouched
        :return: mean query-set accuracy over the tasks of the batch
        """
        task_accs = []
        sum_gradients = []
        num_task = len(batch_tasks)
        num_inner_update_step = self.inner_update_step if training else self.inner_update_step_eval

        for task_id, task in enumerate(batch_tasks):
            support = task[0]
            query   = task[1]

            # Fine-tune a throw-away copy so the meta model is only changed by the outer update.
            fast_model = deepcopy(self.model)
            fast_model.to(self.device)
            support_dataloader = DataLoader(support, sampler=RandomSampler(support),
                                            batch_size=self.inner_batch_size)

            inner_optimizer = Adam(fast_model.parameters(), lr=self.inner_update_lr)
            fast_model.train()

            print('----Task',task_id, '----')
            for update_step in range(0,num_inner_update_step):
                all_loss = []
                for batch in support_dataloader:

                    batch = tuple(t.to(self.device) for t in batch)
                    input_ids, attention_mask, segment_ids, label_id = batch
                    outputs = fast_model(input_ids, attention_mask, segment_ids, labels = label_id)

                    loss = outputs[0]
                    loss.backward()
                    inner_optimizer.step()
                    inner_optimizer.zero_grad()

                    all_loss.append(loss.item())

                if update_step % 4 == 0:
                    print("Inner Loss: ", np.mean(all_loss))

            if training:
                # The meta-gradient is a plain weight difference. Compute it without
                # autograd so that no graph is attached to the tensors later stored in
                # `.grad`, and bring each fast weight to the meta weight's device
                # instead of moving the whole fast model back and forth.
                with torch.no_grad():
                    weight_pairs = zip(self.model.parameters(), fast_model.parameters())
                    for idx, (meta_params, fast_params) in enumerate(weight_pairs):
                        gradient = meta_params - fast_params.to(meta_params.device)
                        if task_id == 0:
                            sum_gradients.append(gradient)
                        else:
                            sum_gradients[idx] += gradient

            fast_model.eval()
            with torch.no_grad():
                # Evaluate the adapted model on the whole query set in a single batch.
                query_dataloader = DataLoader(query, sampler=None, batch_size=len(query))
                # Built-in next(): iterators have no `.next()` method in Python 3.
                query_batch = next(iter(query_dataloader))
                query_batch = tuple(t.to(self.device) for t in query_batch)
                q_input_ids, q_attention_mask, q_segment_ids, q_label_id = query_batch
                q_outputs = fast_model(q_input_ids, q_attention_mask, q_segment_ids, labels = q_label_id)

                q_probs = F.softmax(q_outputs[1],dim=1)
                pre_label_id = torch.argmax(q_probs,dim=1)
                pre_label_id = pre_label_id.detach().cpu().numpy().tolist()
                q_label_id = q_label_id.detach().cpu().numpy().tolist()

                # accuracy_score expects (y_true, y_pred)
                acc = accuracy_score(q_label_id, pre_label_id)
                task_accs.append(acc)

            # Drop the adapted copy before the next task to release its memory.
            del fast_model, inner_optimizer
            torch.cuda.empty_cache()

        # Skip the outer update when the batch was empty and nothing was accumulated.
        if training and sum_gradients:
            # Average the gradient across tasks and hand it to the meta model's
            # parameters, then let the outer optimizer update the weights.
            for params, summed in zip(self.model.parameters(), sum_gradients):
                params.grad = summed / float(num_task)

            self.outer_optimizer.step()
            self.outer_optimizer.zero_grad()

            del sum_gradients
            gc.collect()

        return np.mean(task_accs)

In [15]:
# Build the meta learner from the hyper-parameters in `args`; this loads the pretrained BERT checkpoint.
learner = Learner(args)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:
# Fix the seed while sampling the test tasks so the same tasks are drawn on every run.
random_seed(123)
test = MetaTask(test_examples, num_task = 3, k_support=40, k_query=20, tokenizer = tokenizer)
# Re-seed from the clock afterwards so later sampling is not tied to seed 123.
# NOTE(review): `time.time() % 10` yields only ten distinct seeds (0-9) - confirm this is intended.
random_seed(int(time.time() % 10))

In [17]:
# Raw support examples of the third test task
test.supports[2]

[{'text': 'i have been playing the madden series since i was a kid and this game is really good. the games just keep getting better and better as each year passes. everything in this game is worth the money i strongly reccommend it',
  'label': 'positive',
  'domain': 'computer_&_video_games'},
 {'text': 'I noticed there were not many new items in this game, I found this the advertising for this game to be deceiving. This expansion ws not worth the money',
  'label': 'negative',
  'domain': 'computer_&_video_games'},
 {'text': "This pack offered very little of anything. The decor (very little of it) is uninspired. I was hoping to get some good artwork at least. I know a lot of people don't like to download from fan sites but don't waste your money on this junk. Believe me fansites have much better and more creative stuff for your sims and most of it is free, and none of it cost as much as this glamour life stuff pack did. This is the first and last stuff pack that I will ever buy",
  '

# Meta-training loop: every epoch samples a fresh set of training tasks and feeds
# them to the learner in batches; every 20 global steps the learner is evaluated
# on the fixed test tasks. Accuracies are appended to log.txt.
global_step = 0

for epoch in range(args.meta_epoch):

    # Task-set sizes come from `args` so the configuration lives in one place
    # (the values match the previously hard-coded 500 / 80 / 20).
    train = MetaTask(train_examples, num_task = args.num_task_train, k_support = args.k_spt,
                     k_query = args.k_qry, tokenizer = tokenizer)
    db = create_batch_of_tasks(train, is_shuffle = True, batch_size = args.outer_batch_size)

    for step, task_batch in enumerate(db):

        # Re-open the log per step so each step's results are flushed to disk;
        # the context manager closes the file even if a step raises.
        with open('log.txt', 'a') as f:

            acc = learner(task_batch)

            print('Step:', step, '\ttraining Acc:', acc)
            f.write(str(acc) + '\n')

            if global_step % 20 == 0:
                # Fixed seed so every evaluation adapts to the test tasks identically.
                random_seed(123)
                print("\n-----------------Testing Mode-----------------\n")
                db_test = create_batch_of_tasks(test, is_shuffle = False, batch_size = 1)
                acc_all_test = [learner(test_batch, training = False) for test_batch in db_test]
                test_acc = np.mean(acc_all_test)

                # The learner returns accuracy, so label it as accuracy (was printed as "F1").
                print('Step:', step, 'Test Acc:', test_acc)
                f.write('Test' + str(test_acc) + '\n')

                # Return to a clock-derived seed for the following training steps.
                random_seed(int(time.time() % 10))

        global_step += 1

### Reference

- https://github.com/mailong25/meta-learning-bert