In [1]:
# Step 1: Install necessary libraries
!pip install transformers datasets torch
!pip install transformers[torch]

!pip install "accelerate>=0.26.0"


zsh:1: no matches found: transformers[torch]


In [2]:
# Step 2: Import libraries
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, GPT2Config, GPT2LMHeadModel, TrainingArguments, Trainer, pipeline
from datasets import Dataset
import torch
import numpy as np
import math
from itertools import chain
from sklearn.model_selection import train_test_split
import pandas as pd
import accelerate

*Data Loading*

In [3]:
# let us load our data and display them
file_path = '/teamspace/studios/this_studio/zeleznice_dataset.csv'

data = pd.read_csv(file_path)
data

Unnamed: 0,text
0,Odchod Pirátů z vládní koalice se začíná proje...
1,Společnost Škoda Group se blíží završení velké...
2,České dráhy a polský státní dopravce PKP Inter...
3,Slovenská společnost Rolling Stock Lease (RS L...
4,Evropská investiční banka (EIB) půjčí Českým d...
...,...
1027,Dieselové soupravy a jednotky na rakouské úzko...
1028,"Praha se dohodla se Správou železnic (SŽ), že ..."
1029,Chystané nové opravárenské hale Českých drah v...
1030,Železničná spoločnosť Slovensko (ZSSK) dosáhla...


**Tokenize**

In [4]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    return tokenizer(example["text"])

dataset = Dataset.from_pandas(data)
tokenized_ds = dataset.map(tokenize_function, batched=True, remove_columns=["text"])


Map:   0%|          | 0/1032 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1230 > 1024). Running this sequence through the model will result in indexing errors


In [5]:
def concatenate_and_chunk(dataset, chunk_size=512):
    all_input_ids = list(chain(*dataset["input_ids"]))
    chunks = [all_input_ids[i:i + chunk_size] for i in range(0, len(all_input_ids), chunk_size)]
    chunks = [chunk for chunk in chunks if len(chunk) == chunk_size]
    return Dataset.from_dict({"input_ids": chunks})

chunked_ds = concatenate_and_chunk(tokenized_ds)

In [6]:
split_ds = chunked_ds.train_test_split(test_size=0.1)


data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

split_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 2249
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 250
    })
})

**Model**

In [7]:
config = GPT2Config(
    vocab_size=len(tokenizer),
    n_positions=512,
    n_embd=768,
    n_layer=4,
    n_head=4
)

model = GPT2LMHeadModel(config)

In [8]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, np.ndarray):
        logits = torch.tensor(logits)
    if isinstance(labels, np.ndarray):
        labels = torch.tensor(labels)

    shift_labels = labels[:, 1:].reshape(-1)
    shift_logits = logits[:, :-1, :].reshape(-1, logits.shape[-1])
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
    loss = loss_fct(shift_logits, shift_labels)
    perplexity = math.exp(loss.item())
    return {"perplexity": perplexity}

**Model train args**

In [9]:
# Set this according to size of your dataset
# You should train for at least 15 mins on A10 GPU to get something reasonable
TRAIN_EPOCHS = 5

SAVE_STEPS = 2000
EVAL_STEPS = SAVE_STEPS // 2

# training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-training",  # Directory to save the model checkpoints and other outputs
    eval_strategy="steps",  # Evaluation strategy to use during training ('steps' or 'epochs')
    eval_steps=EVAL_STEPS,  # Perform evaluation every EVAL_STEPS steps
    num_train_epochs=TRAIN_EPOCHS,  # Total number of training epochs
    per_device_train_batch_size=16,  # Batch size for training on each device
    per_device_eval_batch_size=16,  # Batch size for evaluation on each device
    learning_rate=0.01,  # Initial learning rate for the optimizer
    lr_scheduler_type='cosine',  # Learning rate scheduler type. 'cosine' provides a cosine decay schedule.
    warmup_ratio=0.05,  # Proportion of training to perform linear learning rate warmup for
    adam_beta1=0.9,  # Beta1 parameter for the Adam optimizer (first moment decay)
    adam_beta2=0.999,  # Beta2 parameter for the Adam optimizer (second moment decay)
    weight_decay=0.01,  # Weight decay to apply (L2 regularization)
    logging_strategy="steps",  # Logging strategy to use. 'steps' logs at specified steps.
    logging_steps=EVAL_STEPS,  # Log training metrics every EVAL_STEPS steps
    save_steps=SAVE_STEPS,  # Save a checkpoint every SAVE_STEPS steps
    save_total_limit=10,  # Maximum number of checkpoints to keep. Older checkpoints are deleted.
    # report_to='wandb',  # Uncomment to report metrics to Weights and Biases (optional)
)

trainer = Trainer(model=model,
                 args = training_args,
                 tokenizer=tokenizer,
                 train_dataset=split_ds["train"],
                 eval_dataset=split_ds["test"],
                 compute_metrics=compute_metrics,
                 data_collator = data_collator)

  trainer = Trainer(model=model,


*Let us train the model*

In [10]:
trainer.train()

Step,Training Loss,Validation Loss


TrainOutput(global_step=705, training_loss=5.17635818373227, metrics={'train_runtime': 212.4708, 'train_samples_per_second': 52.925, 'train_steps_per_second': 3.318, 'total_flos': 979445006991360.0, 'train_loss': 5.17635818373227, 'epoch': 5.0})

**Generator**

In [11]:
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)
PROMPT = "Vlak jede"  # Customize prompt as needed
generated_texts = generator(PROMPT, max_length=50, do_sample=True, temperature=0.5, repetition_penalty=1.9)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [12]:
emotion_model = pipeline("text-classification", model="Toshifumi/bert-base-multilingual-cased-finetuned-emotion")
emotion_results = emotion_model(generated_texts[0]["generated_text"])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [13]:
print("Generated Text:")
print(generated_texts[0]["generated_text"])
print("\nEmotion Analysis:")
print(emotion_results)

Generated Text:
Vlak jede.�řou kolení ujuchm zom vÚ p�á dopravďi žovatrké trozledzů�okečas

Emotion Analysis:
[{'label': 'LABEL_1', 'score': 0.7141880393028259}]
