# NLP Transformers Search and Sampling

In [None]:
!pip install transformers

In [None]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer


# Load the tokenizer that belongs to the pretrained "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Load the TensorFlow GPT-2 LM-head model from the same checkpoint.
# The tokenizer's EOS token id is handed over as pad_token_id so that
# generation does not warn about a missing PAD token.
model = TFGPT2LMHeadModel.from_pretrained(
    "gpt2",
    pad_token_id=tokenizer.eos_token_id,
)

**Greedy Search**

In [4]:
# Tokenize the prompt that generation is conditioned on, as TensorFlow tensors.
# `input_ids` is reused by the later generation cells.
input_ids = tokenizer.encode("I am an avid", return_tensors="tf")

# No sampling or beam options are passed here. max_length counts the prompt
# tokens as well, so prompt + continuation is capped at 50 tokens.
greedy_output = model.generate(input_ids, max_length=50)

# Header line followed by a 100-character dashed rule.
print("Output:", "-" * 100, sep="\n")
# Decode the first sequence of the batch, dropping special tokens.
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
I am an avid reader of the Bible and I have been reading it for years. I have read many books and have read many books on the subject of the Bible. I have read many books on the subject of the Bible. I have read many


**Beam Search**

In [5]:
# Switch to beam search with 5 beams and enable early_stopping;
# the total length (prompt included) is still capped at 50 tokens.
beam_output = model.generate(input_ids, max_length=50, num_beams=5, early_stopping=True)

# Header line followed by a 100-character dashed rule.
print("Output:", "-" * 100, sep="\n")
# Decode the first returned sequence, dropping special tokens.
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
I am an avid gamer and I love to play video games. I love to play video games and I love to play video games. I love to play video games and I love to play video games. I love to play video games and I love


In [6]:
# Same 5-beam search as before, now with no_repeat_ngram_size=2.
beam_output = model.generate(
    input_ids,
    num_beams=5,
    early_stopping=True,
    no_repeat_ngram_size=2,
    max_length=50,
)

# Header line followed by a 100-character dashed rule.
print("Output:", "-" * 100, sep="\n")
# Decode the first returned sequence, dropping special tokens.
print(tokenizer.decode(beam_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
I am an avid gamer and I love to play video games. I have been playing games since I was a little kid, but I've never played a video game before.

I've always wanted to be a gamer, so I decided to


In [7]:
# Set num_return_sequences > 1 to get several beam hypotheses back instead of
# only the best one. Here all 5 beams are returned
# (num_return_sequences must not exceed num_beams).
beam_outputs = model.generate(
    input_ids,
    max_length=50,
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences=5,
    early_stopping=True,
)

# Now we have 5 output sequences; print each one with its index.
print("Output:\n" + 100 * '-')
for i, beam_output in enumerate(beam_outputs):
    print(f"{i}: {tokenizer.decode(beam_output, skip_special_tokens=True)}")

Output:
----------------------------------------------------------------------------------------------------
0: I am an avid gamer and I love to play video games. I have been playing games since I was a little kid, but I've never played a video game before.

I've always wanted to be a gamer, so I decided to
1: I am an avid gamer and I love to play video games. I have been playing games since I was a little kid, but I've never played a video game before.

I've always wanted to be a professional gamer, so I decided
2: I am an avid gamer and I love to play video games. I have been playing games since I was a little kid, but I've never played a video game before.

I've always wanted to be a professional gamer, so I started
3: I am an avid gamer and I love to play video games. I have been playing games since I was a little kid, but I've never played a video game before.

I've always wanted to be a professional gamer, so when I
4: I am an avid gamer and I love to play video games. I have bee

**Top-K Sampling**

In [8]:
# Fix TensorFlow's global seed so the sampled text is reproducible;
# pick another seed to see a different sample.
tf.random.set_seed(0)

# Turn sampling on (do_sample=True); top_k=0 switches top-k filtering off.
sample_output = model.generate(input_ids, do_sample=True, max_length=50, top_k=0)

# Header line followed by a 100-character dashed rule.
print("Output:", "-" * 100, sep="\n")
# Decode the first returned sequence, dropping special tokens.
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
I am an avid sports fan and have been for over 10 years now. I have never been mixed with a creature and while I don't dislike them, I don't approve of their behavior. I could not be more wrong. I have been mixed


In [9]:
# Fix TensorFlow's global seed so the sampled text is reproducible;
# pick another seed to see a different sample.
tf.random.set_seed(0)

# Sampling with top-k filtering switched off (top_k=0) and temperature=0.7,
# which makes the sampler less sensitive to low-probability candidates.
sample_output = model.generate(
    input_ids,
    temperature=0.7,
    top_k=0,
    do_sample=True,
    max_length=50,
)

# Header line followed by a 100-character dashed rule.
print("Output:", "-" * 100, sep="\n")
# Decode the first returned sequence, dropping special tokens.
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
I am an avid reader of and a person of great love. He has been a friend of mine for a long time and I am very thankful for that. I would love to know if he has any other experience of this or any other. He


In [10]:
# Fix TensorFlow's global seed so the sampled text is reproducible;
# pick another seed to see a different sample.
tf.random.set_seed(0)

# Top-K sampling with top_k set to 50.
sample_output = model.generate(input_ids, do_sample=True, max_length=50, top_k=50)

# Header line followed by a 100-character dashed rule.
print("Output:", "-" * 100, sep="\n")
# Decode the first returned sequence, dropping special tokens.
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
I am an avid reader of his works, and he is the writer for The Unwritten, which is a collection of essays, essays, poems, and short stories of varying degrees of importance to the community.

I have also been a volunteer


**Top-p Sampling**

In [11]:
# Fix TensorFlow's global seed so the sampled text is reproducible;
# pick another seed to see a different sample.
tf.random.set_seed(0)

# Top-p sampling: top-k filtering is switched off (top_k=0) and sampling is
# limited to the 92% most likely words (top_p=0.92).
sample_output = model.generate(
    input_ids,
    top_k=0,
    top_p=0.92,
    do_sample=True,
    max_length=50,
)

# Header line followed by a 100-character dashed rule.
print("Output:", "-" * 100, sep="\n")
# Decode the first returned sequence, dropping special tokens.
print(tokenizer.decode(sample_output[0], skip_special_tokens=True))

Output:
----------------------------------------------------------------------------------------------------
I am an avid dietitian. I have found that I enjoy eating well, not just healthy.

But what I don't realize is that I did not celebrate my European wedding this year.

I was so excited to do so


**Top-K with Top-p Sampling**

In [12]:
# Fix TensorFlow's global seed so the sampled texts are reproducible;
# pick another seed to see different samples.
tf.random.set_seed(0)

# Combine both filters (top_k=50, top_p=0.95) and request 3 sampled sequences.
sample_outputs = model.generate(
    input_ids,
    num_return_sequences=3,
    top_p=0.95,
    top_k=50,
    do_sample=True,
    max_length=50,
)

# Header line followed by a 100-character dashed rule.
print("Output:", "-" * 100, sep="\n")
# Print every returned sequence prefixed with its index.
for i, sample_output in enumerate(sample_outputs):
    print(f"{i}: {tokenizer.decode(sample_output, skip_special_tokens=True)}")

Output:
----------------------------------------------------------------------------------------------------
0: I am an avid reader of JK Rowling. I have been wanting to read a lot of books for a while and had a lot of thoughts on the book and on Harry Potter. I am also a big fan of the book and I do try
1: I am an avid sports blogger (yes I am), we always do our own searches, and when a product is relevant for us, we take it down and re-write all our reviews. I do this so that we can post our favorites to
2: I am an avid shooter and this game has brought me back in a big way for the first time in my life, so I can't wait to get on with the next step."

The new edition of the shooter's sequel "Gun City
