In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
%load_ext autoreload
%autoreload 2

import copy
import math
import random
import time
from collections import OrderedDict, defaultdict
from typing import Union, List

import numpy as np
import torch
import torch.nn as nn
from matplotlib import pyplot as plt
from torch.optim import *
from torch.optim.lr_scheduler import *
from torch.utils.data import DataLoader
from torchprofile import profile_macs
from torchvision.datasets import *
from torchvision.transforms import *
from tqdm.auto import tqdm
import torchvision.models as models

from torchprofile import profile_macs
# from torch.nn.parallel import DistributedDataParallel as DDP
# import torch.distributed as dist
from torch.nn.parallel import DataParallel


assert torch.cuda.is_available(), \
"CUDA support is not available."

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
# set device 
# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#set global device
# torch.cuda.set_device(device)
# dist.init_process_group(backend='nccl')

<torch._C.Generator at 0x7ff5e54f2fd0>

In [4]:
import itertools
import csv
import fire

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import tokenization
import models
import optim
import train

from utils import set_seeds, get_device, truncate_tokens_pair

In [5]:
class CsvDataset(Dataset):
    """ Dataset Class for CSV file """
    labels = None
    def __init__(self, file, pipeline=[]): # cvs file and pipeline object
        Dataset.__init__(self)
        data = []
        with open(file, "r") as f:
            # list of splitted lines : line is also list
            lines = csv.reader(f, delimiter='\t', quotechar=None)
            for instance in self.get_instances(lines): # instance : tuple of fields
                for proc in pipeline: # a bunch of pre-processing
                    instance = proc(instance)
                data.append(instance)

        # To Tensors
        self.tensors = [torch.tensor(x, dtype=torch.long) for x in zip(*data)]

    def __len__(self):
        return self.tensors[0].size(0)

    def __getitem__(self, index):
        return tuple(tensor[index] for tensor in self.tensors)

    def get_instances(self, lines):
        """ get instance array from (csv-separated) line list """
        raise NotImplementedError

In [6]:
class MRPC(CsvDataset):
    """ Dataset class for MRPC """
    labels = ("0", "1") # label names
    def __init__(self, file, pipeline=[]):
        super().__init__(file, pipeline)

    def get_instances(self, lines):
        for line in itertools.islice(lines, 1, None): # skip header
            yield line[0], line[3], line[4] # label, text_a, text_b


In [7]:
class MNLI(CsvDataset):
    """ Dataset class for MNLI """
    labels = ("contradiction", "entailment", "neutral") # label names
    def __init__(self, file, pipeline=[]):
        super().__init__(file, pipeline)

    def get_instances(self, lines):
        for line in itertools.islice(lines, 1, None): # skip header
            yield line[-1], line[8], line[9] # label, text_a, text_b


In [8]:
class Pipeline():
    """ Preprocess Pipeline Class : callable """
    def __init__(self):
        super().__init__()

    def __call__(self, instance):
        raise NotImplementedError

        
class Tokenizing(Pipeline):
    """ Tokenizing sentence pair """
    def __init__(self, preprocessor, tokenize):
        super().__init__()
        self.preprocessor = preprocessor # e.g. text normalization
        self.tokenize = tokenize # tokenize function

    def __call__(self, instance):
        label, text_a, text_b = instance

        label = self.preprocessor(label)
        tokens_a = self.tokenize(self.preprocessor(text_a))
        tokens_b = self.tokenize(self.preprocessor(text_b)) \
                   if text_b else []

        return (label, tokens_a, tokens_b)


In [9]:
class AddSpecialTokensWithTruncation(Pipeline):
    """ Add special tokens [CLS], [SEP] with truncation """
    def __init__(self, max_len=512):
        super().__init__()
        self.max_len = max_len

    def __call__(self, instance):
        label, tokens_a, tokens_b = instance

        # -3 special tokens for [CLS] text_a [SEP] text_b [SEP]
        # -2 special tokens for [CLS] text_a [SEP]
        _max_len = self.max_len - 3 if tokens_b else self.max_len - 2
        truncate_tokens_pair(tokens_a, tokens_b, _max_len)

        # Add Special Tokens
        tokens_a = ['[CLS]'] + tokens_a + ['[SEP]']
        tokens_b = tokens_b + ['[SEP]'] if tokens_b else []

        return (label, tokens_a, tokens_b)


class TokenIndexing(Pipeline):
    """ Convert tokens into token indexes and do zero-padding """
    def __init__(self, indexer, labels, max_len=512):
        super().__init__()
        self.indexer = indexer # function : tokens to indexes
        # map from a label name to a label index
        self.label_map = {name: i for i, name in enumerate(labels)}
        self.max_len = max_len

    def __call__(self, instance):
        label, tokens_a, tokens_b = instance

        input_ids = self.indexer(tokens_a + tokens_b)
        segment_ids = [0]*len(tokens_a) + [1]*len(tokens_b) # token type ids
        input_mask = [1]*(len(tokens_a) + len(tokens_b))

        label_id = self.label_map[label]

        # zero padding
        n_pad = self.max_len - len(input_ids)
        input_ids.extend([0]*n_pad)
        segment_ids.extend([0]*n_pad)
        input_mask.extend([0]*n_pad)

        return (input_ids, segment_ids, input_mask, label_id)


In [10]:
class Classifier(nn.Module):
    """ Classifier with Transformer """
    def __init__(self, cfg, n_labels):
        super().__init__()
        self.transformer = models.Transformer(cfg)
        self.fc = nn.Linear(cfg.dim, cfg.dim)
        self.activ = nn.Tanh()
        self.drop = nn.Dropout(cfg.p_drop_hidden)
        self.classifier = nn.Linear(cfg.dim, n_labels)

    def forward(self, input_ids, segment_ids, input_mask):
        h = self.transformer(input_ids, segment_ids, input_mask)
        # only use the first h in the sequence
        pooled_h = self.activ(self.fc(h[:, 0]))
        logits = self.classifier(self.drop(pooled_h))
        return logits


In [11]:
train_cfg = "config/train_mrpc.json"
model_cfg = "config/bert_base.json"
vocab = "/home/shariff/layers/pytorchic-bert-2/uncased_L-2_H-128_A-2/vocab.txt"
mode = "eval"
task = "mrpc"
data_file = "/home/shariff/glue_data/MRPC/dev.tsv"
model_file = "/data/shariff/bert_tiny/output_leaky_210/model_steps_300.pt"
max_len = 128
save_dir = "../exp/bert/mrpc"
data_parallel=True

In [12]:
def dataset_class(task):
    """ Mapping from task string to Dataset Class """
    table = {'mrpc': MRPC, 'mnli': MNLI}
    return table[task]


In [13]:
cfg = train.Config.from_json(train_cfg)
model_cfg = models.Config.from_json(model_cfg)

set_seeds(cfg.seed)

tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
TaskDataset = dataset_class(task) # task dataset class according to the task
pipeline = [Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
            AddSpecialTokensWithTruncation(max_len),
            TokenIndexing(tokenizer.convert_tokens_to_ids,
                          TaskDataset.labels, max_len)]
dataset = TaskDataset(data_file, pipeline)
data_iter = DataLoader(dataset, batch_size=cfg.batch_size, shuffle=True)

model = Classifier(model_cfg, len(TaskDataset.labels))
criterion = nn.CrossEntropyLoss()

trainer = train.Trainer(cfg,
                        model,
                        data_iter,
                        optim.optim4GPU(cfg, model),
                        save_dir, get_device())



cuda (1 GPUs)


In [14]:
if mode == 'train':
    def get_loss(model, batch, global_step): # make sure loss is a scalar tensor
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        loss = criterion(logits, label_id)
        return loss

    trainer.train(get_loss, model_file, pretrain_file, data_parallel)

elif mode == 'eval':
    def evaluate(model, batch):
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float() #.cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    results = trainer.eval(evaluate, model_file, data_parallel)
    total_accuracy = torch.cat(results).mean().item()
    print('Accuracy:', total_accuracy)


Loading the model from /data/shariff/bert_tiny/output_leaky_210/model_steps_300.pt


Iter(acc=0.833): 100%|██████████| 13/13 [00:00<00:00, 22.39it/s]

Accuracy: 0.7156863212585449





In [15]:
trainer.load(model_file, None)

Loading the model from /data/shariff/bert_tiny/output_leaky_210/model_steps_300.pt


In [16]:
model = trainer.model.to(trainer.device)

In [17]:
model

Classifier(
  (transformer): Transformer(
    (embed): Embeddings(
      (tok_embed): Embedding(30522, 128)
      (pos_embed): Embedding(512, 128)
      (seg_embed): Embedding(2, 128)
      (norm): LayerNorm()
      (drop): Dropout(p=0.1, inplace=False)
    )
    (blocks): ModuleList(
      (0): BlockLeaky(
        (attn): MultiHeadedSelfAttention(
          (proj_q): Linear(in_features=128, out_features=128, bias=True)
          (proj_k): Linear(in_features=128, out_features=128, bias=True)
          (proj_v): Linear(in_features=128, out_features=128, bias=True)
          (drop): Dropout(p=0.1, inplace=False)
        )
        (proj): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm()
        (pwff): PositionWiseFeedForwardLeaky(
          (fc1): Linear(in_features=128, out_features=512, bias=True)
          (fc2): Linear(in_features=512, out_features=128, bias=True)
        )
        (norm2): LayerNorm()
        (drop): Dropout(p=0.1, inplace=False)
   

In [18]:
class PositionWiseFeedForwardIdentity(nn.Module):
    """ FeedForward Neural Networks for each position """
    def __init__(self, cfg):
        super().__init__()
        self.fc1 = nn.Linear(128, 512)
        self.fc2 = nn.Linear(512, 128)
        #self.activ = lambda x: activ_fn(cfg.activ_fn, x)

    def forward(self, x):
        # (B, S, D) -> (B, S, D_ff) -> (B, S, D)
        activ_fn = nn.Identity()
        return self.fc2(activ_fn(self.fc1(x)))


In [19]:
pwff_new = PositionWiseFeedForwardIdentity(cfg)
pwff_new = copy.deepcopy(model.transformer.blocks[0].pwff)

In [20]:
w0 = pwff_new._modules['fc1'].weight.data
w3 = pwff_new._modules['fc2'].weight.data
b0 = pwff_new._modules['fc1'].bias.data
b3 = pwff_new._modules['fc2'].bias.data
pwff_new._modules['fc2'].weight.data = torch.matmul(w3, w0)
pwff_new._modules['fc2'].bias.data = torch.matmul(w3, b0) + b3
pwff_new._modules['fc1'] = nn.Identity()

In [21]:
compressed_model = copy.deepcopy(model)

In [22]:
compressed_model.transformer.blocks[0].pwff = copy.deepcopy(pwff_new)

In [23]:
print(compressed_model)

Classifier(
  (transformer): Transformer(
    (embed): Embeddings(
      (tok_embed): Embedding(30522, 128)
      (pos_embed): Embedding(512, 128)
      (seg_embed): Embedding(2, 128)
      (norm): LayerNorm()
      (drop): Dropout(p=0.1, inplace=False)
    )
    (blocks): ModuleList(
      (0): BlockLeaky(
        (attn): MultiHeadedSelfAttention(
          (proj_q): Linear(in_features=128, out_features=128, bias=True)
          (proj_k): Linear(in_features=128, out_features=128, bias=True)
          (proj_v): Linear(in_features=128, out_features=128, bias=True)
          (drop): Dropout(p=0.1, inplace=False)
        )
        (proj): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm()
        (pwff): PositionWiseFeedForwardLeaky(
          (fc1): Identity()
          (fc2): Linear(in_features=512, out_features=128, bias=True)
        )
        (norm2): LayerNorm()
        (drop): Dropout(p=0.1, inplace=False)
      )
      (1): BlockIdentity(
        (at

In [24]:
trainer.model = copy.deepcopy(compressed_model)

In [25]:
# w0 = model.transformer.blocks[11].pwff._modules['fc1'].weight.data
# w3 = model.transformer.blocks[11].pwff._modules['fc2'].weight.data
# b0 = model.transformer.blocks[11].pwff._modules['fc1'].bias.data
# b3 = model.transformer.blocks[11].pwff._modules['fc2'].bias.data
# model.transformer.blocks[11].pwff._modules['fc2'].weight.data = torch.matmul(w3, w0)
# model.transformer.blocks[11].pwff._modules['fc2'].bias.data = torch.matmul(w3, b0) + b3
# model.transformer.blocks[11].pwff._modules['fc1'] = nn.Identity()

In [26]:
# def collapse_layers_bert(model, relu_id=11):
#     w0 = model.classifier._modules[str(relu_id - 1)].weight.data
#     w3 = model.classifier._modules[str(relu_id + 2)].weight.data
#     b0 = model.classifier._modules[str(relu_id - 1)].bias.data
#     b3 = model.classifier._modules[str(relu_id + 2)].bias.data
#     model.classifier._modules[str(relu_id + 2)].weight.data = torch.matmul(w3, w0)
#     model.classifier._modules[str(relu_id + 2)].bias.data = torch.matmul(w3, b0) + b3

#     model.classifier._modules[str(relu_id - 1)] = nn.Identity()
#     model.classifier._modules[str(relu_id)] = nn.Identity()
#     model.classifier._modules[str(relu_id + 1)] = nn.Identity()
    


In [27]:
trainer.model

Classifier(
  (transformer): Transformer(
    (embed): Embeddings(
      (tok_embed): Embedding(30522, 128)
      (pos_embed): Embedding(512, 128)
      (seg_embed): Embedding(2, 128)
      (norm): LayerNorm()
      (drop): Dropout(p=0.1, inplace=False)
    )
    (blocks): ModuleList(
      (0): BlockLeaky(
        (attn): MultiHeadedSelfAttention(
          (proj_q): Linear(in_features=128, out_features=128, bias=True)
          (proj_k): Linear(in_features=128, out_features=128, bias=True)
          (proj_v): Linear(in_features=128, out_features=128, bias=True)
          (drop): Dropout(p=0.1, inplace=False)
        )
        (proj): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm()
        (pwff): PositionWiseFeedForwardLeaky(
          (fc1): Identity()
          (fc2): Linear(in_features=512, out_features=128, bias=True)
        )
        (norm2): LayerNorm()
        (drop): Dropout(p=0.1, inplace=False)
      )
      (1): BlockIdentity(
        (at

In [28]:
def evaluate_compressed(compressed_model, batch):
    input_ids, segment_ids, input_mask, label_id = batch
    logits = compressed_model(input_ids, segment_ids, input_mask)
    # print(compressed_model)
    _, label_pred = logits.max(1)
    result = (label_pred == label_id).float() #.cpu().numpy()
    accuracy = result.mean()
    return accuracy, result

# results = trainer.eval(evaluate_compressed, model_file, data_parallel)


In [29]:
results = [] 
iter_bar = tqdm(trainer.data_iter, desc='Iter (loss=X.XXX)')
for batch in iter_bar:
    batch = [t.to(trainer.device) for t in batch]
    with torch.no_grad(): # evaluation without gradient calculation
        accuracy, result = evaluate_compressed(compressed_model, batch) # accuracy to print
    results.append(result)

    iter_bar.set_description('Iter(acc=%5.3f)'%accuracy)

total_accuracy = torch.cat(results).mean().item()
print('Accuracy:', total_accuracy)


Iter(acc=0.750): 100%|██████████| 13/13 [00:00<00:00, 314.33it/s]

Accuracy: 0.7156863212585449





In [30]:
print(sum(p.numel() for p in model.parameters()))

4386178


In [31]:
print(sum(p.numel() for p in compressed_model.parameters()))

4270978


In [30]:
print(sum(p.numel() for p in compressed_model.parameters()))

4155778


In [31]:
param_size = 0
for param in model.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 16.292MB


In [32]:
param_size = 0
for param in compressed_model.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in compressed_model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('compressed model size: {:.3f}MB'.format(size_all_mb))

compressed model size: 15.853MB


In [33]:
torch.save(compressed_model.state_dict(), "/data/shariff/bert_tiny/output_leaky_210/compressed_model.pt")


In [35]:
compressed_model

Classifier(
  (transformer): Transformer(
    (embed): Embeddings(
      (tok_embed): Embedding(30522, 128)
      (pos_embed): Embedding(512, 128)
      (seg_embed): Embedding(2, 128)
      (norm): LayerNorm()
      (drop): Dropout(p=0.1, inplace=False)
    )
    (blocks): ModuleList(
      (0): Block(
        (attn): MultiHeadedSelfAttention(
          (proj_q): Linear(in_features=128, out_features=128, bias=True)
          (proj_k): Linear(in_features=128, out_features=128, bias=True)
          (proj_v): Linear(in_features=128, out_features=128, bias=True)
          (drop): Dropout(p=0.1, inplace=False)
        )
        (proj): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm()
        (pwff): PositionWiseFeedForward(
          (fc1): Linear(in_features=128, out_features=512, bias=True)
          (fc2): Linear(in_features=512, out_features=128, bias=True)
        )
        (norm2): LayerNorm()
        (drop): Dropout(p=0.1, inplace=False)
      )
     