## Intel OpenVINO

In [2]:
model_id = "textattack/roberta-base-SST-2"
seq_len = 256

In [3]:
from functools import partial
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from optimum.intel import OVConfig, OVQuantizer

model = AutoModelForSequenceClassification.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

def preprocess_function(examples, tokenizer):
    return tokenizer(examples["sentence"], padding="max_length", max_length=128, truncation=True)

INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, onnx, openvino


Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [9]:
save_dir = "ptq_model"

In [10]:
# Load the default quantization configuration detailing the quantization we wish to apply
quantization_config = OVConfig()

# Instantiate our OVQuantizer using the desired configuration
quantizer = OVQuantizer.from_pretrained(model)

In [11]:
# Create the calibration dataset used to perform static quantization
calibration_dataset = quantizer.get_calibration_dataset(
    "glue",
    dataset_config_name="sst2",
    preprocess_function=partial(preprocess_function, tokenizer=tokenizer),
    num_samples=300,
    dataset_split="train",
)
# Apply static quantization and export the resulting quantized model to OpenVINO IR format
quantizer.quantize(
    quantization_config=quantization_config,
    calibration_dataset=calibration_dataset,
    save_directory=save_dir,
)
# Save the tokenizer
tokenizer.save_pretrained(save_dir)

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading and preparing dataset glue/sst2 to /home/robertgshaw/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading data:   0%|          | 0.00/7.44M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

Dataset glue downloaded and prepared to /home/robertgshaw/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


Map:   0%|          | 0/300 [00:00<?, ? examples/s]

INFO:nncf:Not adding activation input quantizer for operation: 12 RobertaForSequenceClassification/RobertaModel[roberta]/RobertaEmbeddings[embeddings]/NNCFEmbedding[token_type_embeddings]/embedding_0
INFO:nncf:Not adding activation input quantizer for operation: 11 RobertaForSequenceClassification/RobertaModel[roberta]/RobertaEmbeddings[embeddings]/NNCFEmbedding[word_embeddings]/embedding_0
INFO:nncf:Not adding activation input quantizer for operation: 3 RobertaForSequenceClassification/RobertaModel[roberta]/RobertaEmbeddings[embeddings]/ne_0
INFO:nncf:Not adding activation input quantizer for operation: 4 RobertaForSequenceClassification/RobertaModel[roberta]/RobertaEmbeddings[embeddings]/int_0
INFO:nncf:Not adding activation input quantizer for operation: 5 RobertaForSequenceClassification/RobertaModel[roberta]/RobertaEmbeddings[embeddings]/cumsum_0
INFO:nncf:Not adding activation input quantizer for operation: 13 RobertaForSequenceClassification/RobertaModel[roberta]/RobertaEmbeddin

INFO:nncf:Not adding activation input quantizer for operation: 146 RobertaForSequenceClassification/RobertaModel[roberta]/RobertaEncoder[encoder]/ModuleList[layer]/RobertaLayer[4]/RobertaAttention[attention]/RobertaSelfAttention[self]/__add___0
INFO:nncf:Not adding activation input quantizer for operation: 149 RobertaForSequenceClassification/RobertaModel[roberta]/RobertaEncoder[encoder]/ModuleList[layer]/RobertaLayer[4]/RobertaAttention[attention]/RobertaSelfAttention[self]/matmul_1
INFO:nncf:Not adding activation input quantizer for operation: 155 RobertaForSequenceClassification/RobertaModel[roberta]/RobertaEncoder[encoder]/ModuleList[layer]/RobertaLayer[4]/RobertaAttention[attention]/RobertaSelfOutput[output]/__add___0
INFO:nncf:Not adding activation input quantizer for operation: 156 RobertaForSequenceClassification/RobertaModel[roberta]/RobertaEncoder[encoder]/ModuleList[layer]/RobertaLayer[4]/RobertaAttention[attention]/RobertaSelfOutput[output]/NNCFLayerNorm[LayerNorm]/layer_no

INFO:nncf:Not adding activation input quantizer for operation: 306 RobertaForSequenceClassification/RobertaModel[roberta]/RobertaEncoder[encoder]/ModuleList[layer]/RobertaLayer[9]/RobertaOutput[output]/__add___0
INFO:nncf:Not adding activation input quantizer for operation: 307 RobertaForSequenceClassification/RobertaModel[roberta]/RobertaEncoder[encoder]/ModuleList[layer]/RobertaLayer[9]/RobertaOutput[output]/NNCFLayerNorm[LayerNorm]/layer_norm_0
INFO:nncf:Not adding activation input quantizer for operation: 320 RobertaForSequenceClassification/RobertaModel[roberta]/RobertaEncoder[encoder]/ModuleList[layer]/RobertaLayer[10]/RobertaAttention[attention]/RobertaSelfAttention[self]/__add___0
INFO:nncf:Not adding activation input quantizer for operation: 323 RobertaForSequenceClassification/RobertaModel[roberta]/RobertaEncoder[encoder]/ModuleList[layer]/RobertaLayer[10]/RobertaAttention[attention]/RobertaSelfAttention[self]/matmul_1
INFO:nncf:Not adding activation input quantizer for opera

For instance, instead of `compressed_model.get_graph()` you should now write `compressed_model.nncf.get_graph()`.
The old style will be removed after NNCF v2.5.0
  return self._level_low.item()
  return self._level_high.item()


verbose: False, log level: Level.ERROR

NNCF relies on custom-wrapping the `forward` call in order to function properly.
Arbitrary adjustments to the forward function on an NNCFNetwork object have undefined behaviour.
If you need to replace the underlying forward function of the original model so that NNCF should be using that instead of the original forward function that NNCF saved during the compressed model creation, you can do this by calling:
model.nncf.set_original_unbound_forward(fn)
if `fn` has an unbound 0-th `self` argument, or
with model.nncf.temporary_bound_original_forward(fn): ...
if `fn` already had 0-th `self` argument bound or never had it in the first place.


Configuration saved in ptq_model/openvino_config.json


('ptq_model/tokenizer_config.json',
 'ptq_model/special_tokens_map.json',
 'ptq_model/vocab.json',
 'ptq_model/merges.txt',
 'ptq_model/added_tokens.json',
 'ptq_model/tokenizer.json')

## Run OpenVINO

In [22]:
from optimum.intel import OVModelForSequenceClassification

ov_model_int8 = OVModelForSequenceClassification.from_pretrained(save_dir, export=False)
ov_model_fp32 = OVModelForSequenceClassification.from_pretrained(model_id, export=True)

tokenizer = AutoTokenizer.from_pretrained(save_dir)

Compiling the model...
Framework not specified. Using pt to export to ONNX.
Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using framework PyTorch: 2.0.1+cu117
Overriding 1 configuration item(s)
	- use_cache -> False


verbose: False, log level: Level.ERROR



Compiling the model...


In [20]:
sentence = "He is a dreadful magician"
inputs = tokenizer(sentence, max_length=256, padding="max_length", truncation=True, return_tensors="np")

In [24]:
import time
iterations = 100

start = time.perf_counter()
for _ in range(iterations):
    ov_model_fp32(**inputs)
end = time.perf_counter()

print(f"items/sec: {iterations / (end-start)}")

items/sec: 19.90129716437686


In [55]:
!ls

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Untitled.ipynb	compressed_graph.dot  env  original_graph.dot  ptq_model


In [26]:
import time
iterations = 100

start = time.perf_counter()
for _ in range(iterations):
    ov_model_int8(**inputs)
end = time.perf_counter()

print(f"items/sec for: {iterations / (end-start)}")

items/sec for: 41.41784595429765


In [56]:
print(inputs)

{'input_ids': array([[    0,   894,    16,    10, 31715, 38034,     2,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,

In [59]:
!ls ptq_model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
config.json  openvino_config.json  special_tokens_map.json  vocab.json
merges.txt   openvino_model.bin    tokenizer.json
model_cache  openvino_model.xml    tokenizer_config.json


In [62]:
!benchmark_app --help

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
usage: benchmark_app [-h [HELP]] [-i PATHS_TO_INPUT [PATHS_TO_INPUT ...]] -m
                     PATH_TO_MODEL [-d TARGET_DEVICE]
                     [-hint {throughput,tput,cumulative_throughput,ctput,latency,none}]
                     [-niter NUMBER_ITERATIONS] [-t TIME] [-b BATCH_SIZE]
                     [-shape SHAPE] [-data_shape DATA_SHAPE] [-layout LAYOUT]
                     [-extensions EXTENSIONS] [-c PATH_TO_CLDNN_CONFIG]
                     [-cdir CACHE_DIR] [-lfile [LOAD_FROM_FILE]]
                     [-api {sync,async}] [-nireq NUMBER_INFER_REQUESTS]
                     [-nstreams NUMBER_STREAMS]
    

In [70]:
!benchmark_app \
    --path_to_model ptq_model/openvino_model.xml \
    -hint latency \
    -shape "attention_mask[1,256],input_ids[1,256]" \
    -data_shape "attention_mask[1,256],ianput_ids[1,256]"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2023.0.1-11005-fa1c41994f3-releases/2023/0
[ INFO ] 
[ INFO ] Device info:
[ INFO ] CPU
[ INFO ] Build ................................. 2023.0.1-11005-fa1c41994f3-releases/2023/0
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 53.62 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [?,?]
[ INFO ]     attention_mask (node: attention_mask) : i64 / [...] 

In [71]:
!benchmark_app \
    --path_to_model ptq_model/openvino_model.xml \
    -hint tput \
    -shape "attention_mask[1,256],input_ids[1,256]" \
    -data_shape "attention_mask[1,256],ianput_ids[1,256]"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2023.0.1-11005-fa1c41994f3-releases/2023/0
[ INFO ] 
[ INFO ] Device info:
[ INFO ] CPU
[ INFO ] Build ................................. 2023.0.1-11005-fa1c41994f3-releases/2023/0
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 50.27 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     input_ids (node: input_ids) : i64 / [...] / [?,?]
[ INFO ]     attention_mask (node: attention_mask) : i64 / [...] 

In [29]:
ov_model_int8.reshape(1,256)
ov_model_fp32.reshape(1,256)
ov_model_int8.compile()
ov_model_fp32.compile()

Compiling the model...
Compiling the model...


In [32]:
import time
iterations = 100

start = time.perf_counter()
for _ in range(iterations):
    ov_model_fp32(**inputs)
end = time.perf_counter()

print(f"items/sec for: {iterations / (end-start)}")

items/sec for: 20.41257934771766


In [33]:
import time
iterations = 100

start = time.perf_counter()
for _ in range(iterations):
    ov_model_int8(**inputs)
end = time.perf_counter()

print(f"items/sec for: {iterations / (end-start)}")

items/sec for: 42.75287537958746


#### OV HUB MODEL

In [44]:
ov_model_hub_id = "OpenVINO/bert-base-uncased-sst2-int8-unstructured80"
ov_model_hub = OVModelForSequenceClassification.from_pretrained(ov_model_hub_id, export=False)
tokenizer_ov_hub = AutoTokenizer.from_pretrained(ov_model_hub_id)

Compiling the model...


In [47]:
sentence = "He is a dreadful magician"
inputs_ov_hub = tokenizer_ov_hub(sentence, max_length=256, padding="max_length", truncation=True, return_tensors="np")

In [50]:
import time
iterations = 100

start = time.perf_counter()
for _ in range(iterations):
    ov_model_hub(**inputs_ov_hub)
end = time.perf_counter()

print(f"items/sec for: {iterations / (end-start)}")

items/sec for: 41.76662096029358


In [52]:
ov_model_hub.reshape(1,256)
ov_model_hub.compile()

Compiling the model...


In [53]:
import time
iterations = 100

start = time.perf_counter()
for _ in range(iterations):
    ov_model_hub(**inputs_ov_hub)
end = time.perf_counter()

print(f"items/sec for: {iterations / (end-start)}")

items/sec for: 43.87668884660598


## DeepSparse Baseline

In [35]:
!pip install deepsparse

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting deepsparse
  Using cached deepsparse-1.5.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.9 MB)
Collecting pydantic<2.0.0,>=1.8.2
  Using cached pydantic-1.10.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
Collecting protobuf<=3.20.1,>=3.12.2
  Using cached protobuf-3.20.1-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl (1.0 MB)
Collecting sparsezoo~=1.5.0
  Using cached sparsezoo-1.5.2-py3-none-any.whl (131 kB)
Collecting click!=8.0.0,>=7.1.2
  Using cached click-8.1.4-py3-none-any.whl (98 kB)
Collecting onnx<1.15.0,>=1.5.0
  Using cached onnx-1.14.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (14.6 MB)
  Using cached onnx-1.13.1-cp39-cp39-man

In [36]:
from deepsparse import Engine

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [38]:
ds_model_base = Engine(
    "zoo:nlp/sentiment_analysis/oberta-base/pytorch/huggingface/sst2/base-none",
    batch_size=1,
    input_shapes=[[1,256], [1,256]]
)

In [40]:
ds_model_quant = Engine(
    "zoo:nlp/sentiment_analysis/oberta-base/pytorch/huggingface/sst2/base_quant-none",
    batch_size=1,
    input_shapes=[[1,256], [1,256]]
)

In [39]:
ds_model_pruned_quant = Engine(
    "zoo:nlp/sentiment_analysis/oberta-base/pytorch/huggingface/sst2/pruned90_quant-none",
    batch_size=1,
    input_shapes=[[1,256], [1,256]]
)

In [41]:
ds_models = {
    "base": ds_model_base,
    "quant": ds_model_quant,
    "pruned-quant": ds_model_pruned_quant
}

In [46]:
import time
iterations = 100

for model_type in ds_models:
    ds_model = ds_models[model_type]
    ds_input = [inputs["input_ids"], inputs["attention_mask"]]

    start = time.perf_counter()
    for _ in range(iterations):
        ds_model(ds_input)
    end = time.perf_counter()

    print(f"items/sec for {model_type}: {iterations / (end-start)}")

items/sec for base: 18.29179157897903
items/sec for quant: 55.511120014202206
items/sec for pruned-quant: 75.53615177183471


In [72]:
!deepsparse.benchmark zoo:nlp/sentiment_analysis/oberta-base/pytorch/huggingface/sst2/pruned90_quant-none -i [1,256] -nstreams 4

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2023-07-07 21:56:25 deepsparse.benchmark.benchmark_model INFO     Thread pinning to cores enabled
2023-07-07 21:56:26 deepsparse.benchmark.benchmark_model INFO     num_streams reduced to 1 for singlestream scenario.
INFO:deepsparse.benchmark.benchmark_model:num_streams reduced to 1 for singlestream scenario.
DeepSparse, Copyright 2021-present / Neuralmagic, Inc. version: 1.5.2 COMMUNITY | (93c38382) (release) (optimized) (system=avx512_vnni, binary=avx512)
2023-07-07 21:56:41 deepsparse.benchmark.benchmark_model INFO     deepsparse.engine.Engine:
	onnx_file_path: /home/robertgshaw/.cache/sparsezoo/d4496866-ceee-4ea3-936d-fe3bb1dae5cd/model.onnx
	batch_size: 1
	num_cores: 8
	num_streams: 1
	scheduler: Schedul