## Low latency inference for Transformers on Cloud

In [None]:
#!pip -qqq install transformers datasets onnxruntime onnx neural-compressor rich

#### Finetune full & pruned models and run benchmarks

##### Imports

In [21]:
import numpy as np
import torch
import time
from datasets import load_dataset, load_metric
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, 
                          Trainer, TrainingArguments)
import onnxruntime, onnx
from tqdm import tqdm
from onnxruntime.quantization import quantize_dynamic,QuantType
from rich import print
from torch import nn
import copy

In [None]:
n_samples=500
pretrained_model="nreimers/MiniLM-L6-H384-uncased"

In [22]:
dataset = load_dataset("imdb")



  0%|          | 0/3 [00:00<?, ?it/s]

##### Created training and validation samples

In [23]:
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
tokenized_datasets = dataset.map(lambda examples: tokenizer(examples['text'], 
                                                            padding="max_length", 
                                                            truncation=True, 
                                                            max_length=128), batched=True)
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(n_samples))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(n_samples))

loading configuration file https://huggingface.co/nreimers/MiniLM-L6-H384-uncased/resolve/main/config.json from cache at /home/ubuntu/.cache/huggingface/transformers/1e88d8811a541416b53dd21925363de2dd87b7737356fd697ad69e71b78ce3ac.68ccc662d73d8cbbd9461fca6ed6a31ed070e569a9ccbfbdbb5fa4eff33012e5
Model config BertConfig {
  "_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/

In [24]:
model = AutoModelForSequenceClassification.from_pretrained(pretrained_model, num_labels=2)

loading configuration file https://huggingface.co/nreimers/MiniLM-L6-H384-uncased/resolve/main/config.json from cache at /home/ubuntu/.cache/huggingface/transformers/1e88d8811a541416b53dd21925363de2dd87b7737356fd697ad69e71b78ce3ac.68ccc662d73d8cbbd9461fca6ed6a31ed070e569a9ccbfbdbb5fa4eff33012e5
Model config BertConfig {
  "_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
  "architectures": [
    "BertModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 1536,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://hugging

### Prune Layers

In [25]:
# https://github.com/huggingface/transformers/issues/2483


def deleteEncodingLayers(model, num_layers_to_keep):  # must pass in the full bert model
    oldModuleList = model.bert.encoder.layer
    newModuleList = nn.ModuleList()

    # Now iterate over all layers, only keepign only the relevant layers.
    for i in range(num_layers_to_keep):
        newModuleList.append(oldModuleList[i])

    # create a copy of the model, modify it with the new list, and return
    copyOfModel = copy.deepcopy(model)
    copyOfModel.bert.encoder.layer = newModuleList

    return copyOfModel

In [26]:
pruned_model = deleteEncodingLayers(model, 2)

##### Create function for calculating metrics

In [27]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = metric.compute(predictions=predictions, references=labels)
    return accuracy

##### Specify training arguments

In [28]:
training_args = TrainingArguments  (output_dir="test_trainer", 
                                    evaluation_strategy="steps", 
                                    num_train_epochs=20,
                                    logging_steps=50,
                                    per_device_train_batch_size=32,
                                    per_device_eval_batch_size=32,
                                    load_best_model_at_end=True)

using `logging_steps` to initialize `eval_steps` to 50
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


##### Train the full and pruned models

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics
)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 500
  Num Epochs = 20
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 320


Step,Training Loss,Validation Loss,Accuracy
50,0.5603,0.538632,0.746
100,0.2682,0.729343,0.744
150,0.0795,1.03566,0.75
200,0.0275,1.27893,0.754
250,0.031,1.284288,0.746
300,0.023,1.305896,0.748


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 32
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 32
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 32
The following

TrainOutput(global_step=320, training_loss=0.15699612051248552, metrics={'train_runtime': 492.5206, 'train_samples_per_second': 20.304, 'train_steps_per_second': 0.65, 'total_flos': 82914524160000.0, 'train_loss': 0.15699612051248552, 'epoch': 20.0})

In [31]:
trainer = Trainer(
    model=pruned_model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics
)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 500
  Num Epochs = 20
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 320


Step,Training Loss,Validation Loss,Accuracy
50,0.6828,0.652522,0.668
100,0.3879,0.616683,0.734
150,0.1696,0.754027,0.734
200,0.0858,0.881245,0.772
250,0.0551,0.96711,0.766
300,0.0274,1.050235,0.768


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 32
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 32
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 500
  Batch size = 32
The following

TrainOutput(global_step=320, training_loss=0.22341151535511017, metrics={'train_runtime': 184.169, 'train_samples_per_second': 54.298, 'train_steps_per_second': 1.738, 'total_flos': 28402990080000.0, 'train_loss': 0.22341151535511017, 'epoch': 20.0})

### Export to ONNX

In [32]:
# Convert to ONNX
def save_to_onnx(model,path):
    model.to('cpu')
    model.eval();
    input_ids = torch.tensor([[0]]) 
    attention_mask = torch.tensor([[0]]) 

    # Save as ONNX format
    torch.onnx.export(model, (input_ids, attention_mask), 
                    path, 
                    input_names = ['input_ids', 'attention_mask'], 
                    opset_version = 11,
                    dynamic_axes = {'input_ids': {0: 'batch_size', 1: 'channel'},
                                    'attention_mask': {0: 'batch_size', 1: 'channel'}}
                    ) 

In [33]:
save_to_onnx(model,"classification_model_fp32_unpruned.onnx")
save_to_onnx(pruned_model,"classification_model_fp32.onnx")

##### Convert dataset to torch format

In [34]:
small_eval_dataset.set_format(type='torch', columns=['label', 'input_ids', 'token_type_ids', 'attention_mask'])

##### Function to estimate average inference time and accuracy

In [35]:
def run_benchmark(model, dataset=small_eval_dataset,is_inc=1):
  if is_inc: 
    ort_session = onnxruntime.InferenceSession(model.SerializeToString(), None)
  else:
    ort_session = onnxruntime.InferenceSession(model)
  total_diff, tot_len = 0, 0
  preds, gt, hist = [], [], []
  eval_dataloader = torch.utils.data.DataLoader(dataset, batch_size=1)
  tk0 = tqdm(eval_dataloader, total=len(eval_dataloader))

  for bi, d in enumerate(tk0):
    start = time.time()
    ids = d["input_ids"]
    mask = d["attention_mask"]

    # ***** For ONNX ********
    ids = ids.cpu().detach().numpy()
    mask = mask.cpu().detach().numpy()
    ort_inputs = {"input_ids" : ids, "attention_mask": mask}
    ort_outs = ort_session.run(None, ort_inputs)
    end = time.time()

    label = d["label"]
    label = (label.cpu().detach().numpy()).item(0)

    ort_outs = torch.tensor(np.array(ort_outs))
    pred = np.argmax(torch.sigmoid(ort_outs).squeeze().cpu().detach().numpy())
    preds.append(pred)
    gt.append(label)

    if bi > 20:
      diff = end - start
      total_diff = total_diff+diff
      tot_len = tot_len+1      
      hist.append(diff*1000)
        
  print("mean:",np.round(np.mean(hist),2),"ms, ",
        "std:",np.round(np.std(hist),2),"ms, ",
        "min:",np.round(np.min(hist),2),"ms, ",
        "max:",np.round(np.max(hist),2),"ms, ",
        "median:",np.round(np.median(hist),2),"ms, ",
        "95p:",np.round(np.percentile(hist,95),2),"ms, ",
        "99p:",np.round(np.percentile(hist,99),2),"ms")
  result = metric.compute(predictions=preds, references=gt)
  acc = result['accuracy']
  return acc

##### Run benchmarks for pruend and full FP32  models

In [36]:
run_benchmark("classification_model_fp32_unpruned.onnx", is_inc=0)

100%|██████████| 500/500 [00:08<00:00, 59.33it/s]


0.746

In [37]:
run_benchmark("classification_model_fp32.onnx", is_inc=0)

100%|██████████| 500/500 [00:03<00:00, 153.61it/s]


0.768

#### Convert Pruned Model to INT8 and run benchmarks.

##### Quantize the model

In [38]:
def quantize(FP32_model,INT8_model):
    onnx_model = onnx.load(FP32_model)
    onnx.checker.check_model(onnx_model)

    quantize_dynamic(model_input = FP32_model, 
                                    model_output = INT8_model,
                                    weight_type=QuantType.QUInt8
                                    )


In [40]:
quantize("classification_model_fp32.onnx","classification_model_int8.onnx")

In [41]:
run_benchmark("classification_model_int8.onnx", is_inc=0)

100%|██████████| 500/500 [00:01<00:00, 264.45it/s]


0.766

## Intel Neural Compressor

##### YAML Configuration for Neural compressor

In [43]:
%%writefile quantization.yml
model:                                               # mandatory. used to specify model specific information.
  name: bert 
  framework: onnxrt_integerops                       # mandatory. possible values are tensorflow, mxnet, pytorch, pytorch_ipex, onnxrt_integerops and onnxrt_qlinearops.

quantization:
  approach: post_training_dynamic_quant              # optional. default value is post_training_static_quant.                                   

tuning:
  accuracy_criterion:
    relative:  0.01                                  # optional. default value is relative, other value is absolute. this example allows relative accuracy loss: 1%.
  exit_policy:
    timeout: 0                                       # optional. tuning timeout (seconds). default value is 0 which means early stop. combine with max_trials field to decide when to exit.
    max_trials: 1200
  random_seed: 9527                                  # optional. random seed for deterministic tuning.

Overwriting quantization.yml


In [44]:
from neural_compressor.experimental import Quantization, common

def inc_quantize(input_path,quant_config,save_path,tolerance=0.1):
    model=onnx.load(input_path)
    quantizer = Quantization(quant_config)
    quantizer.cfg.tuning.accuracy_criterion.relative=tolerance
    quantizer.model = common.Model(model)
    quantizer.tokenizer= AutoTokenizer.from_pretrained("nreimers/MiniLMv2-L6-H768-distilled-from-BERT-Large")
    quantizer.eval_func = run_benchmark
    q_model = quantizer()
    q_model.save(save_path)

##### Quantize FP32 model with NC and run benchmarks

In [45]:
input_path='classification_model_fp32.onnx'
quant_config="./quantization.yml"
save_path='classification_model_int8_0.1.onnx'

inc_quantize(input_path,quant_config,save_path)

loading configuration file https://huggingface.co/nreimers/MiniLMv2-L6-H768-distilled-from-BERT-Large/resolve/main/config.json from cache at /home/ubuntu/.cache/huggingface/transformers/2e97e014ce3467ee0eb735658065d9786cea77ca2f3b39479387b1e1ea231acd.ff6da012d9fcb6dcf0ca4ad3ef422ee7e31d68144ba91648f4dc4c445db414f0
Model config BertConfig {
  "_name_or_path": "nreimers/MiniLMv2-L6-H768-distilled-from-BERT-Large",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.17.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://hugg

2022-03-21 14:27:04 [INFO] Save tuning history to /home/ubuntu/devcon/nc_workspace/2022-03-21_14-26-55/./history.snapshot.
2022-03-21 14:27:04 [INFO] FP32 baseline is: [Accuracy: 0.7680, Duration (seconds): 3.1306]
2022-03-21 14:27:06 [INFO] |**********Mixed Precision Statistics*********|
2022-03-21 14:27:06 [INFO] +-----------------------+-------+------+------+
2022-03-21 14:27:06 [INFO] |        Op Type        | Total | INT8 | FP32 |
2022-03-21 14:27:06 [INFO] +-----------------------+-------+------+------+
2022-03-21 14:27:06 [INFO] |         Gather        |   7   |  3   |  4   |
2022-03-21 14:27:06 [INFO] |         MatMul        |   15  |  15  |  0   |
2022-03-21 14:27:06 [INFO] | DynamicQuantizeLinear |   13  |  13  |  0   |
2022-03-21 14:27:06 [INFO] +-----------------------+-------+------+------+
2022-03-21 14:27:06 [INFO] Pass quantize model elapsed time: 1714.7 ms
100%|██████████| 500/500 [00:01<00:00, 261.07it/s]


2022-03-21 14:27:07 [INFO] Tune 1 result is: [Accuracy (int8|fp32): 0.7780|0.7680, Duration (seconds) (int8|fp32): 1.9556|3.1306], Best tune result is: [Accuracy: 0.7780, Duration (seconds): 1.9556]
2022-03-21 14:27:07 [INFO] Save tuning history to /home/ubuntu/devcon/nc_workspace/2022-03-21_14-26-55/./history.snapshot.
2022-03-21 14:27:07 [INFO] Specified timeout or max trials is reached! Found a quantized model which meet accuracy goal. Exit.
2022-03-21 14:27:07 [INFO] Save deploy yaml to /home/ubuntu/devcon/nc_workspace/2022-03-21_14-26-55/deploy.yaml


In [46]:
run_benchmark("classification_model_int8_0.1.onnx", is_inc=0)

100%|██████████| 500/500 [00:01<00:00, 261.21it/s]


0.778

##### Quantize model with 0.01% tolerance

In [None]:
input_path='classification_model_fp32.onnx'
quant_config="./quantization.yml"
save_path='classification_model_int8_0.01.onnx'

inc_quantize(input_path,quant_config,save_path,tolerance=0.01)

In [None]:
run_benchmark("classification_model_int8_0.01.onnx", is_inc=0)

##### Create python file for the inference pipeline

In [48]:
%%writefile benchmark.py


import onnxruntime
from datasets import load_dataset, load_metric
import torch
import time
import numpy as np
from transformers import AutoTokenizer
from collections import Counter
from rich import print

dataset = load_dataset("imdb")
pretrained_model="nreimers/MiniLM-L6-H384-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)


def tokenize_function(examples):
    return tokenizer(examples["text"], 
                     padding = "max_length", 
                     truncation = True, 
                     max_length = 16)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(3000))
small_eval_dataset.set_format(type='torch', 
                              columns=['label', 
                                       'input_ids', 
                                       'token_type_ids', 
                                       'attention_mask'])

metric = load_metric("accuracy")

def run_benchmark(model, dataset=small_eval_dataset, is_inc=1):
  if is_inc: 
    ort_session = onnxruntime.InferenceSession(model.SerializeToString(), None)
  else:
    ort_session = onnxruntime.InferenceSession(model)
  total_diff, tot_len = 0, 0
  preds, gt = [], []
  hist = []
  eval_dataloader = torch.utils.data.DataLoader(dataset, batch_size=1)

  for bi, d in enumerate(eval_dataloader):
    start = time.time()
    ids = d["input_ids"]
    mask = d["attention_mask"]

    # ***** For ONNX ********
    ids = ids.cpu().detach().numpy()
    mask = mask.cpu().detach().numpy()
    ort_inputs = {"input_ids" : ids, "attention_mask": mask}
    ort_outs = ort_session.run(None, ort_inputs)
    end = time.time()

    label = d["label"]
    label = (label.cpu().detach().numpy()).item(0)

    ort_outs = torch.tensor(np.array(ort_outs))
    pred = np.argmax(torch.sigmoid(ort_outs).squeeze().cpu().detach().numpy())
    preds.append(pred)
    gt.append(label)

    if bi > 100:
      diff = end - start
      total_diff = total_diff+diff
      tot_len = tot_len+1      
      hist.append(diff*1000)
        
  print("mean:",np.round(np.mean(hist),2),"ms, ",
        "std:",np.round(np.std(hist),2),"ms, ",
        "min:",np.round(np.min(hist),2),"ms, ",
        "max:",np.round(np.max(hist),2),"ms, ",
        "median:",np.round(np.median(hist),2),"ms, ",
        "95p:",np.round(np.percentile(hist,95),2),"ms, ",
        "99p:",np.round(np.percentile(hist,99),2),"ms")
  result = metric.compute(predictions=preds, references=gt)
  acc = result['accuracy']
  return acc
 
accuracy = run_benchmark("classification_model_int8_0.1.onnx", is_inc=0)
print(accuracy)

Overwriting benchmark.py
