# Text generation

Let us first implement the code from the chapter.

## Greedy Search Decoding

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
device = "mps" if torch.backends.mps.is_available() else "cpu"
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [3]:
max_length = 128
input_txt = """In a shocking finding, scientist discovered \
a herd of unicorns living in a remote, previously unexplored \
valley, in the Andes Mountains. Even more surprising to the \
researchers was the fact that the unicorns spoke perfect English.\n\n
"""
input_ids = tokenizer(input_txt, return_tensors="pt")["input_ids"].to(device)
output_greedy = model.generate(input_ids, max_length=max_length, do_sample=False)
print(tokenizer.decode(output_greedy[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  position_ids = attention_mask.long().cumsum(-1) - 1
  if unfinished_sequences.max() == 0:


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


"The unicorns were very intelligent, and they were very intelligent," said Dr. David S. Siegel, a professor of anthropology at the University of California, Berkeley. "They were very intelligent, and they were very intelligent, and they were very intelligent, and they were very intelligent, and they were very intelligent, and they were very intelligent, and they were very intelligent, and they were very


## Beam Search Decoding

In [4]:
output_beam = model.generate(input_ids, max_length=max_length, num_beams=5, do_sample=False)
print(tokenizer.decode(output_beam[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
  input_ids = input_ids.repeat_interleave(expand_size, dim=0)


In a shocking finding, scientist discovered a herd of unicorns living in a remote, previously unexplored valley, in the Andes Mountains. Even more surprising to the researchers was the fact that the unicorns spoke perfect English.


The researchers, from the University of California, San Diego, and the University of California, Santa Cruz, found that the unicorns were able to communicate with each other in a way that was similar to that of human speech.


"The unicorns were able to communicate with each other in a way that was similar to that of human speech," said study co-lead author Dr. David J.


In [None]:
output_beam = model.generate(input_ids, max_length=max_length, num_beams=5, do_sample=False, no_repeat_ngram_size=2)
print(tokenizer.decode(output_beam[0]))

## Sampling methods

In [None]:
output_temp = model.generate(input_ids, max_length=max_length, do_sample=True, temperature=2.0, top_k=0)
print(tokenizer.decode(output_temp[0]))

In [None]:
output_temp = model.generate(input_ids, max_length=max_length, do_sample=True, temperature=0.5, top_k=0)
print(tokenizer.decode(output_temp[0]))

In [None]:
output_topk = model.generate(input_ids, max_length=max_length, do_sample=True, top_k=50)
print(tokenizer.decode(output_topk[0]))

In [None]:
output_topp = model.generate(input_ids, max_length=max_length, do_sample=True, top_p=0.90)
print(tokenizer.decode(output_topp[0]))

## Fine-tuning GPT2 on the Folk- and Mythology Tales dataset

In [5]:
from datasets import load_dataset

dataset = load_dataset("merve/folk-mythology-tales")

Found cached dataset text (/Users/sivakalyan/.cache/huggingface/datasets/merve___text/merve--folk-mythology-tales-ab941ad4cf81c38a/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/1 [00:00<?, ?it/s]

In [6]:
import re

lines = [re.sub(r'^ +', '', line) for line in dataset['train']['text']]
texts_raw = [re.sub(r' {2,4}', '\n\n', x) + tokenizer.special_tokens_map['eos_token'] 
             for x in re.split(' {5,}', ' '.join(lines))]
# print(texts_raw[0])
texts_combined = ''.join(texts_raw)

In [7]:
from datasets import Dataset

tale_dict = {"text": texts_raw}
tales = Dataset.from_dict(tale_dict).train_test_split()

def tokenize_function(examples):
    return tokenizer(examples["text"])

tokenized_tales = tales.map(tokenize_function, batched=True, num_proc=4, remove_columns=["text"])

block_size = 128

def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

lm_tales = tokenized_tales.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

lm_tales

Map (num_proc=4):   0%|          | 0/662 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3520 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2136 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (6638 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2256 > 1024). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/221 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (8398 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1719 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (5891 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2027 > 1024). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/662 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/221 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 18374
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 5361
    })
})

In [8]:
tokenizer.decode(lm_tales["train"][1]["input_ids"])

'.\n\nOne day the Giant came back.\n\nHe had been to visit his friend the Cornish ogre, and had stayed with him for seven years.\n\nAfter the seven years were over he had said all that he had to say, for his conversation was limited, and he determined to return to his own castle.\n\nWhen he arrived he saw the children playing in the garden.\n\n"What are you doing here?" he cried in a very gruff voice, and the children ran away.\n\n"My own garden is my own garden," said the Giant; "any one can understand that, and'

In [9]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    "folk-mythology-tales-finetuned-gpt2",
    per_device_train_batch_size=1024,
    per_device_eval_batch_size=1024,
    evaluation_strategy="steps",
    eval_steps=5000,
    logging_steps=5000,
    gradient_accumulation_steps=2,
    num_train_epochs=60,
    weight_decay=0.1,
    warmup_steps=1_000,
    lr_scheduler_type="cosine",
    learning_rate=5e-3,
    save_steps=5_000,
)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("input_ids")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=lm_tales["train"],
    eval_dataset=lm_tales["test"],
)

In [None]:
trainer.train()

