# Text generation

Let us first implement the code from the chapter.

## Greedy Search Decoding

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [33]:
device = "mps" if torch.backends.mps.is_available() else "cpu"
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [3]:
max_length = 128
input_txt = """In a shocking finding, scientist discovered \
a herd of unicorns living in a remote, previously unexplored \
valley, in the Andes Mountains. Even more surprising to the \
researchers was the fact that the unicorns spoke perfect English.\n\n
"""
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output_greedy = model.generate(input_ids, max_length=max_length, do_sample=False)
print(tokenizer.decode(output_greedy[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  position_ids = attention_mask.long().cumsum(-1) - 1
  if unfinished_sequences.max() == 0:


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


"The unicorns were very intelligent, and they were very intelligent," said Dr. David S. Siegel, a professor of anthropology at the University of California, Berkeley. "They were very intelligent, and they were very intelligent, and they were very intelligent, and they were very intelligent, and they were very intelligent, and they were very intelligent, and they were very intelligent, and they were very


## Beam Search Decoding

In [4]:
output_beam = model.generate(input_ids, max_length=max_length, num_beams=5, do_sample=False)
print(tokenizer.decode(output_beam[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  input_ids = input_ids.repeat_interleave(expand_size, dim=0)


KeyboardInterrupt: 

In [None]:
output_beam = model.generate(input_ids, max_length=max_length, num_beams=5, do_sample=False, no_repeat_ngram_size=2)
print(tokenizer.decode(output_beam[0]))

## Sampling methods

In [None]:
output_temp = model.generate(input_ids, max_length=max_length, do_sample=True, temperature=2.0, top_k=0)
print(tokenizer.decode(output_temp[0]))

In [None]:
output_temp = model.generate(input_ids, max_length=max_length, do_sample=True, temperature=0.5, top_k=0)
print(tokenizer.decode(output_temp[0]))

In [None]:
output_topk = model.generate(input_ids, max_length=max_length, do_sample=True, top_k=50)
print(tokenizer.decode(output_topk[0]))

In [None]:
output_topp = model.generate(input_ids, max_length=max_length, do_sample=True, top_p=0.90)
print(tokenizer.decode(output_topp[0]))

## Fine-tuning GPT2 on the Folk- and Mythology Tales dataset

In [3]:
from datasets import load_dataset

dataset = load_dataset("merve/folk-mythology-tales")

Found cached dataset text (/Users/sivakalyan/.cache/huggingface/datasets/merve___text/merve--folk-mythology-tales-ab941ad4cf81c38a/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

In [4]:
import re

lines = [re.sub(r'^ +', '', line) for line in dataset['train']['text']]
texts_raw = [re.sub(r' {2,4}', '\n\n', x) + tokenizer.special_tokens_map['eos_token'] 
             for x in re.split(' {5,}', ' '.join(lines))]
# print(texts_raw[0])
texts_combined = ''.join(texts_raw)

In [25]:
from datasets import Dataset

tale_dict = {"text": texts_raw}
tales = Dataset.from_dict(tale_dict).train_test_split()

def tokenize_function(examples):
    return tokenizer(examples["text"])

tokenized_tales = tales.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

block_size = 128

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

lm_tales = tokenized_tales.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

lm_tales

Map (num_proc=4):   0%|          | 0/662 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3562 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1491 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1438 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2684 > 1024). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/221 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3731 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (4992 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (5692 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (24990 > 1024). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/662 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/221 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 17894
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 5840
    })
})

In [26]:
tokenizer.decode(lm_tales["train"][1]["input_ids"])

" men-at-arms.\n\nAnd now he thought he was lying wide awake where they had laid him, when suddenly he heard a great thundering sound.\n\n'The cobs are coming!' he said.\n\n'They didn't believe a word I told them!\n\nThe cobs'll be carrying off the princess from under their stupid noses!\n\nBut they shan't! that they shan't!'\n\nHe jumped up, as he thought, and began to dress, but, to his dismay, found that he was still lying in bed.\n\n'Now then, I will!'"

In [34]:
training_args = TrainingArguments(
    "folk-mythology-tales-finetuned-gpt2",
    evaluation_strategy = "epoch",
    learning_rate=2e-5,
    weight_decay=0.01
)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_tales["train"],
    eval_dataset=lm_tales["test"],
)

In [35]:
trainer.train()

ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids,attention_mask.