In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
from transformers import pipeline
from transformers import BartForConditionalGeneration, BartTokenizer, PreTrainedTokenizerFast

In [2]:
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

In [3]:
model_big = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
model_small = BartForConditionalGeneration.from_pretrained("ainize/bart-base-cnn")

Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['model.shared.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# print the number of parameters of the model
print(f"Number of parameters in model_big: {model_big.num_parameters()}")
print(f"Number of parameters in model_small: {model_small.num_parameters()}")
print(f"Big model is {model_big.num_parameters()/model_small.num_parameters()} times larger than the small model")

Number of parameters in model_big: 406290432
Number of parameters in model_small: 139420416
Big model is 2.914138715523557 times larger than the small model


In [5]:
# compare the inference time of the two models
def inference_time(model, tokenized_text):
    with torch.no_grad():
        input_ids = torch.tensor(tokenized_text['input_ids'])
        attention_mask = torch.tensor(tokenized_text['attention_mask'])
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        outputs = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=512)
        end.record()
        torch.cuda.synchronize()
        return start.elapsed_time(end)

In [6]:
# convert the text to a batch (list)
data = pd.read_csv('../datasets/cnn_dailymail/test.csv')
text = data['article'].tolist()
text = text[:100]

In [7]:
# convert to batches
batch_size = 1
# convert to list of lists
batches = [text[i:i + batch_size] for i in range(0, len(text), batch_size)]

In [8]:
for i in tqdm(range(len(batches))):
    batches[i] = tokenizer(batches[i], truncation=True, padding=True, max_length=1024, return_tensors="pt")

100%|██████████| 100/100 [00:00<00:00, 131.21it/s]


In [9]:
# move to GPU
device_0 = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device_1 = torch.device('cuda:1' if torch.cuda.device_count() > 1 else 'cpu')
# device_0 = torch.device('cpu')
# device_1 = torch.device('cpu')
model_big.to(device_0)
model_small.to(device_1)


BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05,

In [10]:
# pass the batches to the models and measure the inference time
inference_times_big = []
inference_times_small = []
for i in tqdm(range(len(batches))):
    # batch = tokenizer(batches[i], truncation=True, padding=True, max_length=1024, return_tensors="pt")
    batch = batches[i]
    batch = {k: v.to(device_0) for k, v in batch.items()}
    inference_times_big.append(inference_time(model_big, batch))
    batch = {k: v.to(device_1) for k, v in batch.items()}
    inference_times_small.append(inference_time(model_small, batch))

  after removing the cwd from sys.path.
  """
100%|██████████| 100/100 [03:06<00:00,  1.86s/it]


In [12]:
print(f"Mean inference time for big model: {np.mean(inference_times_big)} ms")
print(f"Mean inference time for small model: {np.mean(inference_times_small)} ms")
print(f"Big model is {np.mean(inference_times_big)/np.mean(inference_times_small)} times slower than the small model")

Mean inference time for big model: 1180.8170031738282 ms
Mean inference time for small model: 679.6507550048829 ms
Big model is 1.7373879076546377 times slower than the small model
