In [18]:
!pip install transformers datasets torch



In [19]:
text = """
Artificial intelligence is transforming the world.
Machine learning helps computers learn from data.
Deep learning is a powerful technology.
AI is used in healthcare, business, and education.
Technology is evolving rapidly in modern society.
"""

with open("data.txt", "w") as f:
    f.write(text)

In [20]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

print("GPT-2 Loaded Successfully")

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


GPT-2 Loaded Successfully


In [21]:
from datasets import load_dataset

dataset = load_dataset("text", data_files={"train": "data.txt"})

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [22]:
from datasets import load_dataset

# Load dataset
dataset = load_dataset("text", data_files={"train": "data.txt"})

# Fix tokenizer padding issue
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=64
    )

tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [23]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Data collator (important fix)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_steps=10,
    save_steps=10
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator
)

trainer.train()

  super().__init__(loader)


Step,Training Loss


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=9, training_loss=2.5190306769476996, metrics={'train_runtime': 113.9545, 'train_samples_per_second': 0.158, 'train_steps_per_second': 0.079, 'total_flos': 587907072000.0, 'train_loss': 2.5190306769476996, 'epoch': 3.0})

In [24]:
input_text = "Artificial intelligence"

inputs = tokenizer.encode(input_text, return_tensors="pt")

output = model.generate(inputs, max_length=50)

print(tokenizer.decode(output[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Artificial intelligence is a powerful technology. It is a powerful technology. It is a powerful technology. It is a powerful technology. It is a powerful technology. It is a powerful technology. It is a powerful technology. It is a powerful technology.


In [25]:
model.save_pretrained("my_gpt2_model")
tokenizer.save_pretrained("my_gpt2_model")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('my_gpt2_model/tokenizer_config.json', 'my_gpt2_model/tokenizer.json')