# Prepare a Small Text Corpus

In [1]:
from datasets import load_dataset
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

# Load dataset (plain text)

In [2]:
# Map the "train" split to the plain-text corpus file; the resulting rows expose
# a "text" column, which the tokenization step reads and then drops.
# NOTE(review): this is an absolute, machine-specific path - consider making it
# configurable (e.g. relative to a DATA_DIR) so the notebook runs elsewhere.
train_files = {"train": "/workspaces/codespaces-jupyter/data/Data.txt"}
dataset = load_dataset("text", data_files=train_files)

# Load tokenizer & model

In [3]:
# Single source of truth for the checkpoint so tokenizer and model always match.
checkpoint = "gpt2"

# Reuse the end-of-sequence token as the padding token so that batches of
# different-length sequences can be padded by the data collator.
tok = GPT2Tokenizer.from_pretrained(checkpoint)
tok.pad_token = tok.eos_token

# Causal language-modeling head on top of the pretrained GPT-2 weights.
model = GPT2LMHeadModel.from_pretrained(checkpoint)

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


# Tokenize dataset

In [4]:
def tokenize_function(ex):
    """Tokenize the ``"text"`` field of a dataset example or batch.

    Args:
        ex: Mapping with a ``"text"`` entry (a string, or a list of strings
            when used with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level tokenizer ``tok`` returns for that text,
        with sequences truncated to at most 128 tokens.
    """
    texts = ex["text"]
    encoded = tok(texts, truncation=True, max_length=128)
    return encoded

In [5]:
# Tokenize in batches and drop the raw "text" column so only the tokenizer's
# outputs are left for the collator and Trainer.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# Data collator (causal language modeling, CLM)

In [6]:
# mlm=False turns off masked-language-model masking, i.e. the collator prepares
# batches for the causal (next-token) objective; padding uses tok's pad token.
data_collator = DataCollatorForLanguageModeling(tokenizer=tok, mlm=False)

# Training arguments

In [7]:
# All tunable run settings in one place, then unpacked into TrainingArguments.
run_config = dict(
    output_dir="./gpt2-finetuned",      # checkpoints and final model go here
    num_train_epochs=2,
    per_device_train_batch_size=2,
    learning_rate=5e-5,
    logging_steps=10,                   # report training loss every 10 steps
    save_steps=500,                     # checkpoint interval, in optimizer steps
    save_total_limit=1,                 # keep at most one checkpoint on disk
    fp16=False,                         # full-precision training
)
training_args = TrainingArguments(**run_config)

# Trainer

In [9]:
# Only a training split is provided; no evaluation dataset is configured.
train_split = tokenized_dataset["train"]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_split,
    data_collator=data_collator,
)

# Train

In [10]:
# Run fine-tuning. Keeping the result as the cell's last expression displays
# the TrainOutput summary (global step, training loss, runtime metrics).
train_output = trainer.train()
train_output

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,2.69716


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=12, training_loss=2.496250589688619, metrics={'train_runtime': 19.6764, 'train_samples_per_second': 1.118, 'train_steps_per_second': 0.61, 'total_flos': 97474176000.0, 'train_loss': 2.496250589688619, 'epoch': 2.0})

# Save model

In [12]:
# Write the fine-tuned weights and the tokenizer files into the same folder so
# both can later be loaded from a single path.
save_dir = "./gpt2-finetuned"
trainer.save_model(save_dir)
tok.save_pretrained(save_dir)

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('./gpt2-finetuned/tokenizer_config.json', './gpt2-finetuned/tokenizer.json')

# Generate text


In [13]:
# Chat-style prompt: one user turn followed by an open "Assistant:" cue for the
# model to continue. Presumably mirrors the format of the training corpus -
# TODO confirm against Data.txt.
prompt = "User: I want to cancel my order\nAssistant:"


In [14]:
# Training leaves the model in train mode (dropout active). Switch to eval mode
# so sampling is not additionally perturbed by dropout.
model.eval()

# Tokenize the prompt and move the tensors to the model's device: the Trainer
# may have placed the model on an accelerator, and generate() needs the inputs
# on the same device.
inputs = tok(prompt, return_tensors="pt").to(model.device)

# Nucleus/top-k sampling, capped at 80 tokens in total (prompt included).
# pad_token_id is passed explicitly so generate() does not have to fall back to
# eos_token_id on its own (and emit a warning while doing so).
outputs = model.generate(
    **inputs,
    max_length=80,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    pad_token_id=tok.eos_token_id,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [16]:
# Decode the first generated sequence back to text, leaving out special tokens
# (such as the end-of-sequence marker), and print it.
generated_text = tok.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

User: I want to cancel my order
Assistant: Your order has been cancelled.

Assistant: Your order is valid. You will not receive any other e-mail confirmation. Thank you.

Assistant: No additional details received. Thank you for your order.

Assistant: Good. Thank you for your order.

Account has been created. We apologize for any delay you
