# Question Generation

In [253]:
from datasets import load_dataset
from transformers import AutoTokenizer, TFT5ForConditionalGeneration
import tensorflow as tf
import numpy as np
import pandas as pd

squad_v2_data = load_dataset("squad_v2")
train_set = squad_v2_data["train"][:500]
val_set = squad_v2_data["validation"][:1000]

In [248]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = TFT5ForConditionalGeneration.from_pretrained("t5-small")

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [217]:
# First, take a test context for which we will consider one answerable question, and one unanswerable question.
test_context = train_set["context"][2075]
print(len(test_context))
test_context

705


'The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]'

In [218]:
train_set["context"][2074]

'The Legend of Zelda: Twilight Princess (Japanese: ゼルダの伝説 トワイライトプリンセス, Hepburn: Zeruda no Densetsu: Towairaito Purinsesu?) is an action-adventure game developed and published by Nintendo for the GameCube and Wii home video game consoles. It is the thirteenth installment in the The Legend of Zelda series. Originally planned for release on the GameCube in November 2005, Twilight Princess was delayed by Nintendo to allow its developers to refine the game, add more content, and port it to the Wii. The Wii version was released alongside the console in North America in November 2006, and in Japan, Europe, and Australia the following month. The GameCube version was released worldwide in December 2006.[b]'

In [219]:
test_question = train_set["question"][2074]
test_question

'What year was the Wii version of Legend of Zelda: Twilight Princess released?'

In [220]:
train_set_df = pd.DataFrame(train_set)

unanswerable_mask = (train_set_df["answers"] == {'text': [], 'answer_start': []})
train_set_df[unanswerable_mask]

Unnamed: 0,id,title,context,question,answers
2075,5a8d7bf7df8bba001a0f9ab1,The_Legend_of_Zelda:_Twilight_Princess,The Legend of Zelda: Twilight Princess (Japane...,What category of game is Legend of Zelda: Aust...,"{'text': [], 'answer_start': []}"
2076,5a8d7bf7df8bba001a0f9ab2,The_Legend_of_Zelda:_Twilight_Princess,The Legend of Zelda: Twilight Princess (Japane...,What consoles can be used to play Australia Tw...,"{'text': [], 'answer_start': []}"
2077,5a8d7bf7df8bba001a0f9ab3,The_Legend_of_Zelda:_Twilight_Princess,The Legend of Zelda: Twilight Princess (Japane...,When was Australia Twilight launched in North ...,"{'text': [], 'answer_start': []}"
2078,5a8d7bf7df8bba001a0f9ab4,The_Legend_of_Zelda:_Twilight_Princess,The Legend of Zelda: Twilight Princess (Japane...,When could GameCube owners purchase Australian...,"{'text': [], 'answer_start': []}"
2079,5a8d7bf7df8bba001a0f9ab5,The_Legend_of_Zelda:_Twilight_Princess,The Legend of Zelda: Twilight Princess (Japane...,What year was the Legend of Zelda: Australian ...,"{'text': [], 'answer_start': []}"
...,...,...,...,...,...
2896,5ad247c0d7d075001a428b4f,Spectre_(2015_film),"In India, it was reported that the Indian Cent...",Where was support expressed for the censored f...,"{'text': [], 'answer_start': []}"
2900,5ad24827d7d075001a428b54,Spectre_(2015_film),A sequel to Spectre will begin development in ...,A prequel to Spectre will begin when?,"{'text': [], 'answer_start': []}"
2901,5ad24827d7d075001a428b55,Spectre_(2015_film),A sequel to Spectre will begin development in ...,Who will return to direct the next 007 film?,"{'text': [], 'answer_start': []}"
2902,5ad24827d7d075001a428b56,Spectre_(2015_film),A sequel to Spectre will begin development in ...,Who has signed on for three more films in the ...,"{'text': [], 'answer_start': []}"


In [221]:
test_question_impossible = train_set["question"][2075]
test_question_impossible

'What category of game is Legend of Zelda: Australia Twilight?'

In [222]:
len(test_question)

77

In [251]:
# Manually tokenizer the context twice this time to make the inputs equal shape
inputs_tokens = tokenizer([test_context, test_context], return_tensors="tf")
inputs = input_tokens.input_ids
inputs_attention_mask = input_tokens.attention_mask

labels_tokens = tokenizer(test_questions, return_tensors="tf", max_length=40, padding="max_length")
labels = labels_tokens.input_ids
# Test ignoring questions selectively by manually assigning a penalty to the attention mask of the unanswerable question
labels_attention_mask_good = labels_tokens.attention_mask[0]
labels_attention_mask_bad = labels_tokens.attention_mask[1]
labels_attention_mask = tf.stack([labels_attention_mask_good, labels_attention_mask_bad], axis=0)

train_data = {}
train_data["input_ids"] = tf.expand_dims(inputs, 0)
train_data["attention_mask"] = tf.expand_dims(inputs_attention_mask, 0)
train_data["labels"] = tf.expand_dims(labels, 0)
train_data["decoder_attention_mask"] = tf.expand_dims(labels_attention_mask, 0)
dataset = tf.data.Dataset.from_tensor_slices(train_data)

model.compile()

history = model.fit(dataset, epochs=20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [252]:
inputs = tokenizer(test_context, return_tensors="tf",  max_length=694, truncation=True, padding="max_length").input_ids
outputs = model.generate(inputs, num_beams=5, num_return_sequences=5, do_sample=False)

for i in range(5):
    print(tokenizer.decode(outputs[i], skip_special_tokens=True))

What year was the Wii version of Legend of Zelda: Australia Twilight?
What category of game is Legend of Zelda: Australia Twilight?
What year was the Wii version of Legend of Zelda: Australian Twilight?
What month was the Wii version of Legend of Zelda: Australia Twilight?



## Train on more data

In [254]:
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = TFT5ForConditionalGeneration.from_pretrained("t5-small")

contexts = train_set["context"]
questions = train_set["question"]

contexts_max_len = max([len(context.split()) for context in contexts])
questions_max_len = max([len(question.split()) for question in questions])

print(contexts_max_len, questions_max_len)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


326 22


In [255]:
import pandas as pd
len(pd.Series(contexts).unique())

43

In [256]:
[s[:200] for s in pd.Series(contexts).unique()]

['Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in v',
 'Following the disbandment of Destiny\'s Child in June 2005, she released her second solo album, B\'Day (2006), which contained hits "Déjà Vu", "Irreplaceable", and "Beautiful Liar". Beyoncé also venture',
 'A self-described "modern-day feminist", Beyoncé creates songs that are often characterized by themes of love, relationships, and monogamy, as well as female sexuality and empowerment. On stage, her dy',
 'Beyoncé Giselle Knowles was born in Houston, Texas, to Celestine Ann "Tina" Knowles (née Beyincé), a hairdresser and salon owner, and Mathew Knowles, a Xerox sales manager. Beyoncé\'s name is a tribute',
 "Beyoncé attended St. Mary's Elementary School in Fredericksburg, Texas, where she enrolled in dance classes. Her singing talent was discovered when dance instructor Darlette J

In [13]:
%%time
# Takes some time!

inputs = tokenizer(contexts, return_tensors="tf", max_length=contexts_max_len, padding="max_length", truncation=True).input_ids
labels = tokenizer(questions, return_tensors="tf", max_length=questions_max_len, padding="max_length", truncation=True).input_ids

train_data = {}
train_data["input_ids"] = inputs
train_data["labels"] = labels



CPU times: user 16.4 s, sys: 2.07 s, total: 18.5 s
Wall time: 3.36 s


In [14]:
from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.optimizers.schedules import ExponentialDecay

initial_learning_rate = 0.001 # start with default Adam value

lr_schedule = ExponentialDecay(
    # Every 5000 iterations, multiply the learning rate by 0.7
    initial_learning_rate, decay_steps = 50000, decay_rate = 0.7,
)

dataset = tf.data.Dataset.from_tensor_slices(train_data).batch(8)
val_dataset = tf.data.Dataset.from_tensor_slices(val_data).batch(8)

adamw = AdamW(weight_decay=0.04, learning_rate=lr_schedule)

model.compile(optimizer=adamw)

We don't necessarily need to use validation, as it is not important that the generated questions match **exactly** with those in the validation set.

In [15]:
history = model.fit(dataset, epochs=4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [3]:
#model.save_weights("/home/rob/question-generation-checkpoint-#######.h5")
model.load_weights("/home/rob/question-generation-checkpoint-29-11-2023.h5")

In [9]:
n_sequences = 10
test_inputs = tokenizer(test_context, return_tensors="tf").input_ids
test_outputs = model.generate(test_inputs, num_beams=20, num_return_sequences=n_sequences, do_sample=False)
for i in range(n_sequences):
    print(tokenizer.decode(test_outputs[i], skip_special_tokens=True))

What was the name of Beyoncé's debut album?
How many Grammy Awards did Beyoncé win?
Who was the lead singer of Destiny's Child?
How many Grammy Awards did Beyoncé win at Dangerously in Love?
How many Grammy Awards did Beyoncé win over Dangerously in Love?
What was Beyoncé's father's father?
What was Beyoncé's first album?
Who was Beyoncé's lead singer?
Who was Beyoncé's mother?
What was the name of Beyoncé's first album?


In [5]:
n_sequences = 20
#test_context_2 = """Johann Sebastian Bach[n 2] (31 March [O.S. 21 March] 1685 – 28 July 1750) was a German composer and musician of the late Baroque period. He is known for his orchestral music such as the Brandenburg Concertos; instrumental compositions such as the Cello Suites; keyboard works such as the Goldberg Variations and The Well-Tempered Clavier; organ works such as the Schubler Chorales and the Toccata and Fugue in D minor; and vocal music such as the St Matthew Passion and the Mass in B minor. Since the 19th-century Bach revival, he has been generally regarded as one of the greatest composers in the history of Western music."""
test_context_2 = """
Ariana Grande-Butera (/ˌɑːriˈɑːnə ˈɡrɑːndeɪ bjʊˈtɛərə/ AR-ee-AH-nə GRAHN-day byuu-TAIR-ə;[note 1] born June 26, 1993) is an American singer, songwriter, and actress. An influential figure in contemporary popular music, and often regarded as a pop culture icon, she is noted for her four-octave vocal range and whistle register that has garnered critical acclaim. Grande has received numerous accolades throughout her career, including two Grammy Awards, one Brit Award, one Bambi Award, two Billboard Music Awards, three American Music Awards, nine MTV Video Music Awards, and 30 Guinness World Records.

Grande began her music career at age 15 in the 2008 Broadway musical 13. She rose to fame for playing Cat Valentine in the Nickelodeon television series Victorious (2010–2013) and Sam & Cat (2013–2014). Grande signed with Republic Records in 2011 after label executives viewed YouTube videos of her covering songs. Her 1950s doo-wop-influenced pop and R&B debut album,[2] Yours Truly (2013), topped the US Billboard 200, while its lead single, "The Way", reached the top ten of the US Billboard Hot 100. Grande's voice and vocal performances on the album drew immediate comparisons to Mariah Carey."""
test_inputs_2 = tokenizer(test_context_2, return_tensors="tf").input_ids
test_outputs_2 = model.generate(test_inputs_2, num_beams=20, num_return_sequences=n_sequences, do_sample=False)

for i in range(n_sequences):
    print(tokenizer.decode(test_outputs_2[i], skip_special_tokens=True))

2023-11-29 15:58:47.142224: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x5617e5f89620 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2023-11-29 15:58:47.142258: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 3070 Ti Laptop GPU, Compute Capability 8.6
2023-11-29 15:58:47.146026: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2023-11-29 15:58:47.155519: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8700
2023-11-29 15:58:47.199888: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


What was the top ten of the Billboard Hot 100?
What was the top ten of the Billboard Hot 100' Billboard Hot 100?
How many Grammy Awards did Grande play in the Nickelodeon television series Victorious?
When did Grande become fame for playing Cat Valentine in the Nickelodeon television series?
How many Grammy Awards did Grande play in the Nickelodeon series Victorious?
How many Grammy Awards did Grande win in 2008?
Who was the lead single, "The Way"?
When did Grande become fame for playing Cat Valentine in the Nickelodeon show?
What was the name of the lead single, "The Way"?
What was the top ten of the Billboard Hot 100 during this period?
When did Grande become fame for playing Cat Valentine?
What year did Grande become fame for playing Cat Valentine?
What year did Grande become fame for playing Cat Valentine in the Nickelodeon series?
What was the top ten of the Billboard Hot 100 on the album?
When did Grande become fame for playing Cat Valentine in the Nickelodeon series?
What was th

In [6]:
n_sequences = 20
#test_context_2 = """Johann Sebastian Bach[n 2] (31 March [O.S. 21 March] 1685 – 28 July 1750) was a German composer and musician of the late Baroque period. He is known for his orchestral music such as the Brandenburg Concertos; instrumental compositions such as the Cello Suites; keyboard works such as the Goldberg Variations and The Well-Tempered Clavier; organ works such as the Schubler Chorales and the Toccata and Fugue in D minor; and vocal music such as the St Matthew Passion and the Mass in B minor. Since the 19th-century Bach revival, he has been generally regarded as one of the greatest composers in the history of Western music."""
test_context_2 = """
Alkenes are reactive and so are useful for making many other substances including polymers.

Polymers have very large molecules. They are formed when many small molecules join together. This process is called polymerisation.

 When alkenes join together to form a polymer with no other substance being produced in the reaction, the process is called addition polymerisation.

Plastics are polymers and are made by polymerisation.

Example: poly(ethene) (often called polythene) is made by polymerising the simplest alkene, ethene.


"""
test_inputs_2 = tokenizer(test_context_2, return_tensors="tf").input_ids
test_outputs_2 = model.generate(test_inputs_2, num_beams=n_sequences, num_return_sequences=n_sequences, do_sample=False)

for i in range(n_sequences):
    print(tokenizer.decode(test_outputs_2[i], skip_special_tokens=True))

What type of polymers are reactive and useful for making many other substances including polymers?
Polymers are reactive and useful for making many other substances such as what?
What are polymers usually reactive and useful for making many other substances such as polymers?
What is an example of polythene polymerising polythene?
What is an example of polymerizing polythene?
What type of polymers are reactive and so they are useful for making many other substances?
What is an example of polymerising polythene?
What is an example of polythene polythene?
Polymers are reactive and so they are useful for making many other substances such as what?
What are polymers used for?
Polymers are reactive and so useful for making many other substances such as what?
What is an example of polythene?
Polymers are reactive and so they are useful for making many other substances including what?
What is an example of polythene polymers?
What is the polymering process used for?
What is an example of polyme

In [12]:
input_ids

NameError: name 'input_ids' is not defined