In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset

dataset = load_dataset("text", data_files = {"train": "/content/universe_dataset.txt"})
dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 3000
    })
})

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForCausalLM.from_pretrained(model)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [4]:
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

In [20]:
def tokenize_function(examples):
    outputs = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    outputs["labels"] = outputs["input_ids"].copy()
    return outputs

tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=2,
    remove_columns=["text"]
)

Map (num_proc=2):   0%|          | 0/3000 [00:00<?, ? examples/s]

In [21]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [22]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir = "./gpt2-universe",
    learning_rate = 2e-5,
    per_device_train_batch_size = 2,
    num_train_epochs = 3,
    weight_decay = 0.01,
    save_total_limit = 2,
    logging_steps = 50,
    fp16 = True
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset["train"]
)
trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,2.4034
100,0.1656
150,0.0702
200,0.0614
250,0.058
300,0.0558
350,0.0536
400,0.0502
450,0.05
500,0.0495


TrainOutput(global_step=4500, training_loss=0.061880588610967, metrics={'train_runtime': 987.3764, 'train_samples_per_second': 9.115, 'train_steps_per_second': 4.558, 'total_flos': 587907072000000.0, 'train_loss': 0.061880588610967, 'epoch': 3.0})

In [24]:
trainer.save_model("./gpt2-universe-finetuned")
tokenizer.save_pretrained("./gpt2-universe-finetuned")

('./gpt2-universe-finetuned/tokenizer_config.json',
 './gpt2-universe-finetuned/special_tokens_map.json',
 './gpt2-universe-finetuned/vocab.json',
 './gpt2-universe-finetuned/merges.txt',
 './gpt2-universe-finetuned/added_tokens.json',
 './gpt2-universe-finetuned/tokenizer.json')

In [61]:
from transformers import pipeline

generator = pipeline("text-generation", model="./gpt2-universe-finetuned", tokenizer=tokenizer)

prompt = "Milky way galaxy"
result = generator(prompt, max_length=100, num_return_sequences=1)

from IPython.display import Markdown
display(Markdown(result[0]["generated_text"]))

Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Milky way galaxy is our home galaxy, containing over 100 billion stars.