# Installing the Requirements

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m44.6 MB/s[0m eta [36m0:00:0

# Importing the libraries

In [3]:
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import time

## Check the type of device we are using

In [4]:
# Check if CUDA (GPU) is available
use_cuda = torch.cuda.is_available()

# Set the device to either CUDA (GPU) or CPU based on availability
device = torch.device("cuda" if use_cuda else "cpu")

# Print information about the selected device
if use_cuda:
    print("Using CUDA (GPU)")
else:
  print("Using CPU")

Using CPU


# Loading the base model over here

In [5]:
# Load the base RoBERTa model
base_model = RobertaForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")
tokenizer = RobertaTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-emotion")

Downloading (…)lve/main/config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

# Providing the input for checking the base model latency

In [18]:
# Define your input text and process it with the tokenizer
input_text = "It's been a long day without you my friend"
input_ids = tokenizer.encode(input_text, add_special_tokens=True, return_tensors="pt")

# Fetching the latency of base model

In [19]:
# Measure the initial model's latency
start_time = time.time()
_ = base_model(input_ids)
initial_latency = time.time() - start_time

# Calculating the latency of the base model

In [20]:
# Print initial metrics (latency, GPU memory)
print(f"Initial Latency: {initial_latency} seconds")

Initial Latency: 0.2400679588317871 seconds


# Converting the base model into quantized model

In [21]:
import torch
from transformers import RobertaForSequenceClassification

# Quantize the model
quantized_model = torch.quantization.quantize_dynamic(base_model, {torch.nn.Linear}, dtype=torch.float16)



# Providing the input for checking the quantized model latency

In [22]:
# Define your input text and process it with the tokenizer (as shown previously)
input_text = "I love infrared product."
input_ids = tokenizer.encode(input_text, add_special_tokens=True, return_tensors="pt")

# Fetching the latency of quantized model

In [23]:
# # Warm-up the model (optional but recommended)
# with torch.no_grad():
#     _ = quantized_model(input_ids)

In [24]:
# Measure the quantized model's latency for a single inference
start_time = time.time()
with torch.no_grad():
    _ = quantized_model(input_ids)
latency = time.time() - start_time

# Calculating the latency of the quantized model

In [25]:
print(f"Latency of Quantized Model for Single Inference: {latency} seconds")

Latency of Quantized Model for Single Inference: 0.08132314682006836 seconds


# Comparing the parameters of base and quantized model

In [27]:
# Get the model sizes (number of parameters) for both base and quantized models
base_model_size = sum(p.numel() for p in base_model.parameters())
quantized_model_size = sum(p.numel() for p in quantized_model.parameters())

# Print the model sizes
print(f"Number of Parameters in Base Model: {base_model_size}")
print(f"Number of Parameters in Quantized Model: {quantized_model_size}")

Number of Parameters in Base Model: 124648708
Number of Parameters in Quantized Model: 39037440
