In order to run this notebook, you will need to install the huggingface library with the following command: `pip install transformers`

In [1]:
from transformers import T5ForConditionalGeneration, T5Tokenizer, BartForConditionalGeneration, BartTokenizer, GPT2LMHeadModel, GPT2Tokenizer
from torchtext.prototype.generate import GenerationUtil

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
t5 = T5ForConditionalGeneration.from_pretrained("t5-base")
bart = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
gpt2 = GPT2LMHeadModel.from_pretrained("gpt2")

In [5]:
# Testing Huggingface's T5
test_sequence = ["summarize: studies have shown that owning a dog is good for you"]
generative_hf_t5 = GenerationUtil(t5, is_encoder_decoder=True, is_huggingface_model=True)
t5_tokenizer = T5Tokenizer.from_pretrained("t5-base")
test_sequence_tk = t5_tokenizer(test_sequence, return_tensors="pt").input_ids
tokens = generative_hf_t5.generate(test_sequence_tk, max_len=20, pad_idx=t5.config.pad_token_id)
print(t5_tokenizer.batch_decode(tokens, skip_special_tokens=True))

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


['owning a dog is good for you, according to studies. a dog is']


In [6]:
# Testing HuggingFace's T5 w/ Beam Search
tokens = generative_hf_t5.generate(test_sequence_tk, max_len=100, pad_idx=t5.config.pad_token_id, num_beams=5, beam_size_token=t5.config.vocab_size)
print(t5_tokenizer.batch_decode(tokens, skip_special_tokens=True))

['a dog is good for you. studies have shown that dog ownership is good for your overall health and well-being.']


In [7]:
# Testing Decoding Speed HuggingFace's T5 w/ TorchText Beam Search vs. HuggingFace Beam Search
import time

start = time.time()
tokens = generative_hf_t5.generate(test_sequence_tk, max_len=100, pad_idx=t5.config.pad_token_id, num_beams=5, beam_size_token=t5.config.vocab_size)
end = time.time()
print(t5_tokenizer.batch_decode(tokens, skip_special_tokens=True), end - start)

start = time.time()
tokens = t5.generate(test_sequence_tk, max_length=100, num_beams=5, do_sample=False)
end = time.time()
print(t5_tokenizer.batch_decode(tokens, skip_special_tokens=True), end - start)

['a dog is good for you. studies have shown that dog ownership is good for your overall health and well-being.'] 9.786320924758911
['studies have shown that owning a dog is good for you. studies have shown that owning a dog is good for you.'] 1.3000121116638184


In [8]:
# Testing Huggingface's BART
test_sequence = ["PG&E stated it scheduled the blackouts in response to forecasts for high winds "
    "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
    "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."]
generative_hf_bart = GenerationUtil(bart, is_encoder_decoder=True, is_huggingface_model=True)
bart_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
test_sequence_tk = bart_tokenizer(test_sequence, return_tensors="pt").input_ids
tokens = generative_hf_bart.generate(test_sequence_tk, max_len=20, pad_idx=bart.config.pad_token_id)
print(bart_tokenizer.batch_decode(tokens, skip_special_tokens=True))

['PG. PG&E said it scheduled the blackouts in response to forecasts for high winds.']


In [9]:
tokens = generative_hf_bart.generate(test_sequence_tk, max_len=20, pad_idx=bart.config.pad_token_id, num_beams=5, beam_size_token=bart.config.vocab_size)
print(bart_tokenizer.batch_decode(tokens, skip_special_tokens=True))


['Nearly. PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions.']


In [10]:
# Testing Decoding Speed HuggingFace's BART w/ TorchText Beam Search vs. HuggingFace Beam Search
import time

start = time.time()
tokens = generative_hf_bart.generate(test_sequence_tk, max_len=100, pad_idx=t5.config.pad_token_id, num_beams=5, eos_score=1.0, beam_size_token=t5.config.vocab_size)
end = time.time()
print(bart_tokenizer.batch_decode(tokens, skip_special_tokens=True), end - start)

start = time.time()
tokens = bart.generate(test_sequence_tk, max_length=100, num_beams=5, do_sample=False)
end = time.time()
print(bart_tokenizer.batch_decode(tokens, skip_special_tokens=True), end - start)

['PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs. The blackouts are expected to last through at least midday tomorrow. to be affected by the shutoffs which were expected to last through at least midday tomorrow. to be affected by the shutoffs which were expected to last through at least midday tomorrow. to be affected by the'] 58.09997892379761
['PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs. The blackouts were expected to last through at least midday tomorrow.'] 2.456479787826538


In [3]:
# Testing Huggingface's GPT2
test_sequence = ["I enjoy walking with my cute dog"]
generative_hf_gpt2 = GenerationUtil(gpt2, is_encoder_decoder=False, is_huggingface_model=True)
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
test_sequence_tk = gpt2_tokenizer(test_sequence, return_tensors="pt").input_ids
tokens = generative_hf_gpt2.generate(test_sequence_tk, max_len=20, pad_idx=gpt2.config.pad_token_id)
print(gpt2_tokenizer.batch_decode(tokens, skip_special_tokens=True))

["I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to"]


In [4]:
tokens = generative_hf_gpt2.generate(test_sequence_tk, max_len=20, pad_idx=gpt2.config.pad_token_id, num_beams=5, beam_size_token=gpt2.config.vocab_size)
print(gpt2_tokenizer.batch_decode(tokens, skip_special_tokens=True))

['I enjoy walking with my cute dog," says Kelli Williams-Petersen. The dog loves it so much, that when she']
