# Train your own small GPT-2 model

If you want to experiment with the trained model, you can do it at `Inference API` panel of

https://huggingface.co/openai-community/gpt2?text=My+name+is+Thomas+and+my+main

Note that we are training small GPT2 model on a tiny dataset. Still We can see observe how the model improve with the number of steps and get some interesting results.

In [4]:
#import wandb  # we will talk about wandb next lecture
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer

## Prepare data

Before training, we have to tokenize the data and split them into chunks of the same size as context size of the model.

In [5]:
# Replace with your own dataset
dataset = load_dataset("TomasHalmazna/czech_traffic_urls")

# Make validation split
dataset = dataset['train'].train_test_split(test_size=0.0015)

In [4]:
# load the gpt-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token=tokenizer.eos_token

In [5]:
# tokenize the dataset
def tokenize_function(example):
    return tokenizer(text=example["text"])
tokenized_ds = dataset.map(tokenize_function, batched=True, remove_columns='text')
tokenized_ds

Map:   0%|          | 0/1030 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2546 > 1024). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1030
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 2
    })
})

In [6]:
from itertools import chain
from datasets import Dataset, DatasetDict

def concatenate_and_chunk(dataset, chunk_size=512):
    # Flatten all `input_ids` into a single list
    all_input_ids = list(chain(*dataset["input_ids"]))
    
    # Create chunks of `chunk_size`
    chunks = [all_input_ids[i:i + chunk_size] for i in range(0, len(all_input_ids), chunk_size)]
    
    # Only keep chunks that are exactly of length `chunk_size`
    chunks = [chunk for chunk in chunks if len(chunk) == chunk_size]
    
    # Create a new dataset with only the `input_ids` chunks
    return Dataset.from_dict({"input_ids": chunks})

# Apply this function to each split (train and test) in the DatasetDict
chunked_ds = DatasetDict({
    split: concatenate_and_chunk(split_ds, chunk_size=512)
    for split, split_ds in tokenized_ds.items()
})

chunked_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 2494
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 4
    })
})

In [7]:
# data collator joins chunks into batches
# see https://huggingface.co/docs/transformers/en/main_classes/data_collator
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

## Model

In [8]:
# Define the model configuration for the smallest GPT-2
config = GPT2Config(
    vocab_size=len(tokenizer),      # Standard GPT-2 vocab size 50257
    n_positions=512,                # Context size (512 is enough for small-scale models)
    n_embd=768,                     # Embedding size
    n_layer=12,                     # Number of transformer layers
    n_head=12,                      # Number of attention heads
)

# Initialize the model and tokenizer
model = GPT2LMHeadModel(config)

In [9]:
import torch
import math
import numpy as np

# Define the perplexity metric
def compute_metrics(eval_pred):
    # `eval_pred` is a tuple of (logits, labels)
    logits, labels = eval_pred

    # Convert logits and labels to PyTorch tensors if they are NumPy arrays
    if isinstance(logits, np.ndarray):
        logits = torch.tensor(logits)
    if isinstance(labels, np.ndarray):
        labels = torch.tensor(labels)

    # Shift labels so that tokens align for calculating loss
    shift_labels = labels[:, 1:].reshape(-1)
    shift_logits = logits[:, :-1, :].reshape(-1, logits.shape[-1])

    # Calculate the cross-entropy loss
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)  # Ignore padding tokens
    loss = loss_fct(shift_logits, shift_labels)

    # Calculate perplexity
    perplexity = math.exp(loss.item())
    return {"perplexity": perplexity}


## Training

In [10]:
# Set this according to size of your dataset
# You should train for at least 15 mins on A10 GPU to get something reasonable
TRAIN_EPOCHS = 10

SAVE_STEPS = 1000
EVAL_STEPS = SAVE_STEPS // 2

# training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-training",  # Directory to save the model checkpoints and other outputs
    eval_strategy="steps",  # Evaluation strategy to use during training ('steps' or 'epochs')
    eval_steps=EVAL_STEPS,  # Perform evaluation every EVAL_STEPS steps
    num_train_epochs=TRAIN_EPOCHS,  # Total number of training epochs
    per_device_train_batch_size=16,  # Batch size for training on each device
    per_device_eval_batch_size=16,  # Batch size for evaluation on each device
    learning_rate=2.5e-4,  # Initial learning rate for the optimizer
    lr_scheduler_type='cosine',  # Learning rate scheduler type. 'cosine' provides a cosine decay schedule.
    warmup_ratio=0.05,  # Proportion of training to perform linear learning rate warmup for
    adam_beta1=0.9,  # Beta1 parameter for the Adam optimizer (first moment decay)
    adam_beta2=0.999,  # Beta2 parameter for the Adam optimizer (second moment decay)
    weight_decay=0.01,  # Weight decay to apply (L2 regularization)
    logging_strategy="steps",  # Logging strategy to use. 'steps' logs at specified steps.
    logging_steps=EVAL_STEPS,  # Log training metrics every EVAL_STEPS steps
    save_steps=SAVE_STEPS,  # Save a checkpoint every SAVE_STEPS steps
    save_total_limit=10,  # Maximum number of checkpoints to keep. Older checkpoints are deleted.
    # report_to='wandb',  # Uncomment to report metrics to Weights and Biases (optional)
)

trainer = Trainer(model=model,
                 args = training_args,
                 tokenizer=tokenizer,
                 train_dataset=chunked_ds["train"],
                 eval_dataset=chunked_ds["test"],
                 compute_metrics=compute_metrics,
                 data_collator = data_collator)


  trainer = Trainer(model=model,


In [11]:
trainer.train()

Step,Training Loss,Validation Loss,Perplexity
500,3.9232,2.7032,14.92719
1000,2.4921,2.104363,8.201702
1500,1.938,2.00544,7.429206


TrainOutput(global_step=1560, training_loss=2.74758233779516, metrics={'train_runtime': 543.0611, 'train_samples_per_second': 45.925, 'train_steps_per_second': 2.873, 'total_flos': 6516623278080000.0, 'train_loss': 2.74758233779516, 'epoch': 10.0})

In [12]:
trainer.save_model("./gpt2-small-final") 

*Upload to HuggingFace*

In [13]:
YOUR_MODEL_NAME = "my_small_gpt2_trains" # change this
HF_TOKEN = ""  # change this 

model.push_to_hub(YOUR_MODEL_NAME, token=HF_TOKEN)
tokenizer.push_to_hub(YOUR_MODEL_NAME, token=HF_TOKEN)

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/TomasHalmazna/my_small_gpt2_trains/commit/2b0b0e2139499dc79ce52c7005aaca367730e40d', commit_message='Upload tokenizer', commit_description='', oid='2b0b0e2139499dc79ce52c7005aaca367730e40d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/TomasHalmazna/my_small_gpt2_trains', endpoint='https://huggingface.co', repo_type='model', repo_id='TomasHalmazna/my_small_gpt2_trains'), pr_revision=None, pr_num=None)

## Evaluation

Now you can switch from GPU to CPU. Try to complete some prompt specific to your dataset.

Does it make sense? Is it at least in Czech/Slovak?

In [6]:
from transformers import  GPT2LMHeadModel, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token=tokenizer.eos_token

In [2]:
model =  GPT2LMHeadModel.from_pretrained("./gpt2-small-final")
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [12]:
PROMPT = "Vlak jede" # Set starting prompt, something specific for your dataset

generator(
    PROMPT,
    max_length=50,       # Maximum length of the generated text
    do_sample=True,
    temperature=0.5,         # Experiment with this
    repetition_penalty=1.9,  # Experiment with this
)

[{'generated_text': 'Vlak jede z Prahy do Brna a v úseku Ostrava-Svinov – Bohumín. Do této trati je přes Vsetínský nad'}]

Now go back to your training folder `.gpt2-training/`. Each `checkpoint-N` folder contains the model saved after N steps. 

If you experiment with the older models, you should see that the models improves with time.

In [4]:
def get_sample_after_N_steps(N, prompt, **kwargs):
    model =  GPT2LMHeadModel.from_pretrained(f"./gpt2-training/checkpoint-{N}/")
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

    output = generator(prompt, **kwargs)
    return output  

In [13]:
get_sample_after_N_steps(1000, "Koleje", do_sample=True, temperature=0.5)

[{'generated_text': 'Kolejeně. Důvodem je také př'}]

**Summary of Our Model**

- Our model is far from perfect. While the results it produces are in Czech and sometimes return a grammatically correct sentence, they usually do not make meaningful sense.
- The issue lies in the limited word database it is based on and the small number of epochs. Unfortunately, we didn't have enough memory to train the model for more epochs.

**HF Pre-trained models**
- let us now take some model from the HuggingFace hub and use it to our data

In [4]:
type([dataset["train"][i]['text'] for i in range(500)])

list

In [5]:
!pip install simpletransformers



In [8]:
model_args= {
            "num_train_epochs": 15,
            "learning_rate": 1e-5,
            "max_seq_length": 512,
            "silent": True
            }
model = ClassificationModel(
    "xlmroberta", "classla/xlm-roberta-base-multilingual-text-genre-classifier", use_cuda=True,
    args=model_args
    
)
predictions, logit_output = model.predict([[dataset["train"][i]['text'] for i in range(2)]]
                                        )
predictions
# Output: array([3, 8])

[model.config.id2label[i] for i in predictions]
# Output: ['Instruction', 'Promotion']


['News']

In [14]:
from simpletransformers.classification import ClassificationModel

In [13]:
import pandas as pd

# Define model arguments
model_args = {
    "num_train_epochs": 15,
    "learning_rate": 1e-5,
    "max_seq_length": 512,
    "silent": True
}

# Initialize the model
model = ClassificationModel(
    "xlmroberta", "classla/xlm-roberta-base-multilingual-text-genre-classifier", use_cuda=True,
    args=model_args
)

# Create an empty list to store the results
results = []

# Loop over each observation in the dataset and make predictions
for i in range(100):
    # Get the text for the current observation
    text = dataset["train"][i]['text']
    
    # Make a prediction for the current observation
    predictions, _ = model.predict([text])
    
    # Get the label for the prediction
    label = model.config.id2label[predictions[0]]
    
    # Append the result to the list
    results.append({"text": text, "predicted_label": label})

# Convert results to a DataFrame
output = pd.DataFrame(results)

# Display or save the output dataset
output.head()




Unnamed: 0,text,predicted_label
0,Zastupitelé Ústeckého kraje včera udělalil prv...,News
1,Ministerstva dopravy a životního prostředí fin...,News
2,Stavba posledního úseku D1 u Přerova přinesla ...,News
3,Památkově chráněné Hranické viadukty čeká reko...,News
4,České dráhy završily podpisem smlouvy se stave...,News


We can see that the most ou our sentences are "news". We can save the output and check whether there is some other genre.

In [24]:
# Save labes as csv
output[['predicted_label']].to_csv("output_labels.csv", index=False, header=False, encoding="utf-8")


In [23]:
label_counts = output['predicted_label'].value_counts().reset_index()
label_counts.columns = ['label', 'count']
label_counts


Unnamed: 0,label,count
0,News,96
1,Other,2
2,Opinion/Argumentation,1
3,Information/Explanation,1


**Conclusion**

- As conclusion we can say there are more genres than "News" in our dataset. So we have proved (by model) that on the information web (zdopravy.cz) there occur mainly news :-)
- We find the above model very interesting and we would like to know how it works and analyses the given data.