## Distillation of MobileBERT

In [17]:
import torch
from datasets import load_dataset
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score,f1_score
from transformers import TextClassificationPipeline,Trainer, AutoModelForSequenceClassification,AutoTokenizer,TrainingArguments

## Baseline

In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Using device: cuda


In [6]:
clinc = load_dataset("clinc_oos","plus")
clinc

Reusing dataset clinc_oos (C:\Users\Subha\.cache\huggingface\datasets\clinc_oos\plus\1.0.0\abcc41d382f8137f039adc747af44714941e8196e845dfbdd8ae7a7e020e6ba1)


DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 15250
    })
    validation: Dataset({
        features: ['text', 'intent'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['text', 'intent'],
        num_rows: 5500
    })
})

In [8]:
teacher_checkpoint = "lewtun/bert-base-uncased-finetuned-clinc"
teacher_model = (AutoModelForSequenceClassification
                 .from_pretrained(teacher_checkpoint, num_labels=clinc['train'].features['intent'].num_classes)
                 .to(device))
teacher_tokenizer = AutoTokenizer.from_pretrained(teacher_checkpoint)
teacher_model.to('cpu')

pipe = TextClassificationPipeline(model = teacher_model,tokenizer= teacher_tokenizer)

Downloading: 100%|██████████| 252/252 [00:00<00:00, 126kB/s]
Downloading: 100%|██████████| 232k/232k [00:01<00:00, 230kB/s] 
Downloading: 100%|██████████| 112/112 [00:00<00:00, 112kB/s]


In [9]:
pipe('I need to book a cab to the airport')

[{'label': 'uber', 'score': 0.29016366600990295}]

In [10]:
def tokenize(batch):
    return teacher_tokenizer(batch['text'],padding=True,truncation=True)

clinc.reset_format()
clinc.rename_column_('intent','labels')
clinc_encoded = clinc.map(tokenize,batched=True,batch_size=None)
clinc_encoded.set_format('torch',columns=['input_ids','token_type_ids','attention_mask','labels'])
labels = clinc_encoded['test'].features['labels'].names

  """
100%|██████████| 1/1 [00:00<00:00,  1.81ba/s]
100%|██████████| 1/1 [00:00<00:00, 12.66ba/s]
100%|██████████| 1/1 [00:00<00:00,  5.49ba/s]


In [18]:
from time import perf_counter
from datasets import load_metric

def time_pipeline(pipeline, query="What is the pin number for my account?"):
    latencies = []
    # Warmup
    for _ in range(10):
        _ = pipeline(query)
    # Timed run
    for _ in range(100):
        start_time = perf_counter()
        _ = pipeline(query)
        latency = perf_counter() - start_time
        latencies.append(latency)
    # Compute run statistics
    time_avg_ms = 1000 * np.mean(latencies)
    time_std_ms = 1000 * np.std(latencies)
    print(f"Average latency (ms) - {time_avg_ms:.2f} +\- {time_std_ms:.2f}")
    return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

# accuracy_score = load_metric('accuracy')
# def compute_accuracy(pipeline,dataset):
#     preds, labels = [], []
#     for example in dataset:
#         pred = pipeline(example["text"])[0]["label"]
#         label = example["intent"]
#         preds.append(intents.str2int(pred))
#         labels.append(label)
#     accuracy = accuracy_score.compute(predictions=preds, references=labels)
#     print(f"Accuracy on test set - {accuracy['accuracy']:.3f}")
#     return accuracy

teacher_trainer = Trainer(teacher_model)
teacher_trainer.model.to('cuda')
test_preds = teacher_trainer.predict(clinc_encoded['test'])
test_preds.metrics


No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 5500
  Batch size = 8
100%|█████████▉| 685/688 [00:20<00:00, 33.33it/s]

{'test_loss': 1.2073315382003784,
 'test_runtime': 21.0883,
 'test_samples_per_second': 260.808,
 'test_steps_per_second': 32.625}

100%|██████████| 688/688 [00:39<00:00, 33.33it/s]

In [19]:
labels = clinc_encoded['test'].features['labels'].names

import numpy as np
from sklearn.metrics import classification_report

y_preds = np.argmax(test_preds.predictions,axis=1)
y_test = np.array(clinc_encoded['test']['labels'])

print(classification_report(y_test,y_preds,target_names=labels))

                           precision    recall  f1-score   support

       restaurant_reviews       0.83      0.83      0.83        30
           nutrition_info       0.88      1.00      0.94        30
          account_blocked       0.93      0.93      0.93        30
           oil_change_how       0.77      1.00      0.87        30
                     time       0.83      0.97      0.89        30
                  weather       0.68      1.00      0.81        30
           redeem_rewards       0.91      0.67      0.77        30
            interest_rate       0.85      0.97      0.91        30
                 gas_type       0.94      1.00      0.97        30
      accept_reservations       0.96      0.90      0.93        30
               smart_home       0.70      1.00      0.82        30
                user_name       0.82      0.90      0.86        30
         report_lost_card       0.96      0.90      0.93        30
                   repeat       0.90      0.93      0.92     

In [20]:
import torch.nn as nn
import torch.nn.functional as F

class DistillationTrainingArguments(TrainingArguments):
    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha
        self.temperature = temperature

class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model

    def compute_loss(self, model, inputs,return_outputs=False):
        outputs_stu = model(**inputs)
        # Extract cross-entropy loss and logits from student
        loss_ce = outputs_stu.loss
        logits_stu = outputs_stu.logits
        # Extract logits from teacher
        with torch.no_grad():
            outputs_tea = self.teacher_model(**inputs)
            logits_tea = outputs_tea.logits
        # Soften probabilities and compute distillation loss
        loss_fct = nn.KLDivLoss(reduction="batchmean")
        loss_kd = self.args.temperature ** 2 * loss_fct(
            F.log_softmax(logits_stu / self.args.temperature, dim=-1),
            F.softmax(logits_tea / self.args.temperature, dim=-1))
        # Return weighted student loss
        loss = self.args.alpha * loss_ce + (1. - self.args.alpha) * loss_kd
        if return_outputs:
            return  loss,outputs_stu
        return loss

In [21]:
student_ckpt = "google/mobilebert-uncased"
student_tokenizer = AutoTokenizer.from_pretrained(student_ckpt)

def tokenize_text(batch, tokenizer):
    return tokenizer(batch["text"], truncation=True)

clinc_enc = clinc.map(tokenize_text, batched=True, remove_columns=["text"],
                      fn_kwargs={"tokenizer": student_tokenizer})
# clinc_enc.rename_column_("intent", "labels")

Could not locate the tokenizer configuration file, will try to use the model config instead.
loading configuration file https://huggingface.co/google/mobilebert-uncased/resolve/main/config.json from cache at C:\Users\Subha/.cache\huggingface\transformers\f55082bb509cfd3585c1d6e383f529197f03fce817374e4edf03306593ef4c16.8198c2ba6b951372d5ca981b940a059b96ebbb3c514469a2aae7636fa05eb25e
Model config MobileBertConfig {
  "_name_or_path": "google/mobilebert-uncased",
  "architectures": [
    "MobileBertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_activation": false,
  "embedding_size": 128,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 512,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "intra_bottleneck_size": 128,
  "key_query_shared_bottleneck": true,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "mobilebert",
  "normalization_type": "no_norm",
  "num_attention_heads": 4,
  "num_feedforward_net

In [24]:
clinc_enc

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 15250
    })
    validation: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 3100
    })
    test: Dataset({
        features: ['attention_mask', 'input_ids', 'labels', 'token_type_ids'],
        num_rows: 5500
    })
})

In [28]:
import torch
from transformers import AutoConfig

num_labels = clinc['train'].features['labels'].num_classes
id2label = teacher_model.config.id2label
label2id = teacher_model.config.label2id
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

student_config = (AutoConfig
                  .from_pretrained(student_ckpt, num_labels=num_labels,
                                   id2label=id2label, label2id=label2id))

def student_init():
    return (AutoModelForSequenceClassification
            .from_pretrained(student_ckpt, config=student_config).to(device))

loading configuration file https://huggingface.co/google/mobilebert-uncased/resolve/main/config.json from cache at C:\Users\Subha/.cache\huggingface\transformers\f55082bb509cfd3585c1d6e383f529197f03fce817374e4edf03306593ef4c16.8198c2ba6b951372d5ca981b940a059b96ebbb3c514469a2aae7636fa05eb25e
Model config MobileBertConfig {
  "_name_or_path": "google/mobilebert-uncased",
  "architectures": [
    "MobileBertForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_activation": false,
  "embedding_size": 128,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 512,
  "id2label": {
    "0": "restaurant_reviews",
    "1": "nutrition_info",
    "2": "account_blocked",
    "3": "oil_change_how",
    "4": "time",
    "5": "weather",
    "6": "redeem_rewards",
    "7": "interest_rate",
    "8": "gas_type",
    "9": "accept_reservations",
    "10": "smart_home",
    "11": "user_name",
    "12": "report_lost_card",
    "13": "repeat",
    "14": "whisper_mode",
  

In [29]:
from datasets import load_metric
accuracy_score = load_metric('accuracy')


def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy_score.compute(predictions=predictions, references=labels)

batch_size = 48

student_training_args = DistillationTrainingArguments(
    output_dir="checkpoints", evaluation_strategy = "epoch", num_train_epochs=5,
    learning_rate=2e-5, per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size, alpha=1, weight_decay=0.01)

# teacher_checkpoint = "./models/bert-base-uncased"
teacher_model = (AutoModelForSequenceClassification
                 .from_pretrained(teacher_checkpoint, num_labels=num_labels)
                 .to(device))

distil_trainer = DistillationTrainer(model_init=student_init,
    teacher_model=teacher_model, args=student_training_args,
    train_dataset=clinc_enc['train'], eval_dataset=clinc_enc['validation'],
    compute_metrics=compute_metrics, tokenizer=student_tokenizer)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/lewtun/bert-base-uncased-finetuned-clinc/resolve/main/config.json from cache at C:\Users\Subha/.cache\huggingface\transformers\fadfd2f06b239b712a8f18d5f8798ce824386593949d56483f9261919807ab9a.ad7a70ba622652431d76da8a4fd553d218c0ec90de65cd31dcdbde521dae3567
Model config BertConfig {
  "_name_or_path": "models/bert-base-uncased-finetuned-clinc",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "restaurant_reviews",
    "1": "nutrition_info",
    "2": "account_bloc

In [30]:
distil_trainer.train()

loading weights file https://huggingface.co/google/mobilebert-uncased/resolve/main/pytorch_model.bin from cache at C:\Users\Subha/.cache\huggingface\transformers\0f147299bd78a76bb975368144d5c4c35675bc08f3a0171851dc828048f6b5dd.b8a0b89cb6a580f7491453c965bfcaa548c1d89f869773f20510aba2ccd44eb5
Some weights of the model checkpoint at google/mobilebert-uncased were not used when initializing MobileBertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing MobileBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification mode

{'eval_loss': 2.7092342376708984, 'eval_accuracy': 0.40838709677419355, 'eval_runtime': 10.0584, 'eval_samples_per_second': 308.199, 'eval_steps_per_second': 6.462, 'epoch': 1.0}



100%|██████████| 688/688 [10:09<00:00, 33.33it/s]Saving model checkpoint to checkpoints\checkpoint-500
Configuration saved in checkpoints\checkpoint-500\config.json


{'loss': 306951.36, 'learning_rate': 1.371069182389937e-05, 'epoch': 1.57}


Model weights saved in checkpoints\checkpoint-500\pytorch_model.bin
tokenizer config file saved in checkpoints\checkpoint-500\tokenizer_config.json
Special tokens file saved in checkpoints\checkpoint-500\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3100
  Batch size = 48

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


100%|██████████| 688/688 [11:48<00:00, 33.33it/s]
[A

{'eval_loss': 1.0190551280975342, 'eval_accuracy': 0.7412903225806452, 'eval_runtime': 8.3343, 'eval_samples_per_second': 371.956, 'eval_steps_per_second': 7.799, 'epoch': 2.0}


***** Running Evaluation *****
  Num examples = 3100
  Batch size = 48

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


100%|██████████| 688/688 [15:29<00:00, 33.33it/s]
[A

{'eval_loss': 0.7450012564659119, 'eval_accuracy': 0.8151612903225807, 'eval_runtime': 9.2336, 'eval_samples_per_second': 335.73, 'eval_steps_per_second': 7.04, 'epoch': 3.0}



100%|██████████| 688/688 [15:59<00:00, 33.33it/s]Saving model checkpoint to checkpoints\checkpoint-1000
Configuration saved in checkpoints\checkpoint-1000\config.json


{'loss': 1.3066, 'learning_rate': 7.421383647798742e-06, 'epoch': 3.14}


Model weights saved in checkpoints\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in checkpoints\checkpoint-1000\tokenizer_config.json
Special tokens file saved in checkpoints\checkpoint-1000\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3100
  Batch size = 48

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


100%|██████████| 688/688 [19:08<00:00, 33.33it/s]
[A

{'eval_loss': 0.6600404977798462, 'eval_accuracy': 0.8351612903225807, 'eval_runtime': 9.028, 'eval_samples_per_second': 343.374, 'eval_steps_per_second': 7.2, 'epoch': 4.0}



100%|██████████| 688/688 [21:35<00:00, 33.33it/s]Saving model checkpoint to checkpoints\checkpoint-1500
Configuration saved in checkpoints\checkpoint-1500\config.json


{'loss': 1.0568, 'learning_rate': 1.1320754716981133e-06, 'epoch': 4.72}


Model weights saved in checkpoints\checkpoint-1500\pytorch_model.bin
tokenizer config file saved in checkpoints\checkpoint-1500\tokenizer_config.json
Special tokens file saved in checkpoints\checkpoint-1500\special_tokens_map.json
***** Running Evaluation *****
  Num examples = 3100
  Batch size = 48

[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


100%|██████████| 688/688 [22:43<00:00, 33.33it/s]
[A

Training completed. Do not forget to share your model on huggingface.co/models =)



100%|██████████| 1590/1590 [18:03<00:00,  1.47it/s]

{'eval_loss': 0.6295708417892456, 'eval_accuracy': 0.8461290322580645, 'eval_runtime': 9.126, 'eval_samples_per_second': 339.688, 'eval_steps_per_second': 7.122, 'epoch': 5.0}
{'train_runtime': 1083.9992, 'train_samples_per_second': 70.341, 'train_steps_per_second': 1.467, 'train_loss': 96526.39727523131, 'epoch': 5.0}





TrainOutput(global_step=1590, training_loss=96526.39727523131, metrics={'train_runtime': 1083.9992, 'train_samples_per_second': 70.341, 'train_steps_per_second': 1.467, 'train_loss': 96526.39727523131, 'epoch': 5.0})

In [40]:
distilled_model = distil_trainer.model
distilled_model.to('cpu')
pipe_distilled = TextClassificationPipeline(model = distilled_model,tokenizer= student_tokenizer)
time_pipeline(pipe_distilled)

Average latency (ms) - 39.02 +\- 2.72


{'time_avg_ms': 39.02379400000427, 'time_std_ms': 2.723940705737778}

In [34]:
distil_trainer.model.to('cuda')
test_preds = distil_trainer.predict(clinc_encoded['test'])
test_preds.metrics

y_preds = np.argmax(test_preds.predictions,axis=1)
y_test = np.array(clinc_encoded['test']['labels'])

print(classification_report(y_test,y_preds,target_names=labels))

The following columns in the test set  don't have a corresponding argument in `MobileBertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 5500
  Batch size = 48


                           precision    recall  f1-score   support

       restaurant_reviews       0.53      0.63      0.58        30
           nutrition_info       0.64      1.00      0.78        30
          account_blocked       0.61      0.77      0.68        30
           oil_change_how       0.83      1.00      0.91        30
                     time       0.85      0.93      0.89        30
                  weather       0.60      1.00      0.75        30
           redeem_rewards       0.61      0.83      0.70        30
            interest_rate       0.81      0.97      0.88        30
                 gas_type       0.93      0.93      0.93        30
      accept_reservations       0.80      0.80      0.80        30
               smart_home       0.57      0.93      0.71        30
                user_name       0.77      0.67      0.71        30
         report_lost_card       0.90      0.90      0.90        30
                   repeat       0.70      0.87      0.78     

In [36]:
from torch.quantization import quantize_dynamic

distil_trainer.model.to('cpu')
model_quantized = quantize_dynamic(distilled_model,{torch.nn.Linear},dtype=torch.qint8)


In [41]:
pipe_quantized = TextClassificationPipeline(model = model_quantized,tokenizer= student_tokenizer)
time_pipeline(pipe_quantized)

Average latency (ms) - 38.93 +\- 2.44


{'time_avg_ms': 38.9301899999964, 'time_std_ms': 2.4362531855591034}

In [39]:
### Original Model

teacher_model.to('cpu')
pipe = TextClassificationPipeline(model = teacher_model,tokenizer= teacher_tokenizer)
time_pipeline(pipe)

Average latency (ms) - 43.85 +\- 10.89


{'time_avg_ms': 43.85319699998945, 'time_std_ms': 10.888959137988236}