In [1]:
from datasets import Dataset
import pandas as pd

import torch
from transformers import pipeline

# --- Configuration -----------------------------------------------------------
# Sender id whose messages are kept for training.
uid = "user431421166"
# Number of token ids per packed training block.
context_length = 128
# Number of messages handed to the tokenizer per `Dataset.map` batch.
batch_size_tokenizer = 2048

# --- Load the chat export ------------------------------------------------------
# Each element of the "messages" column is one message record; expand the
# records into one DataFrame row per message.
chat_export = pd.read_json("input/gruppo_chiuso.json")
df = pd.DataFrame(chat_export["messages"].tolist())

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the training corpus: one record per non-empty message sent by `uid`.
cleaned_messages = []

for message in df.query("from_id == @uid")["text_entities"]:
    # Concatenate the text of every entity that is kept. Entities of type
    # link / text_link / bot_command are dropped, as is any entity whose text
    # contains the substring "Spoiler".
    cleaned_message = "".join(
        entity["text"]
        for entity in message
        if entity["type"] not in ["link", "text_link", "bot_command"] and "Spoiler" not in entity["text"]
    )

    # Messages that end up empty after filtering carry no training signal.
    if cleaned_message != "":
        cleaned_messages.append({"uid": uid, "content": cleaned_message})

# A fixed seed makes the 80/20 split identical on every Restart & Run All, so
# validation losses stay comparable between runs.
raw_datasets = Dataset.from_list(cleaned_messages).train_test_split(test_size=0.2, seed=42)

In [3]:
from functools import reduce
from transformers import AutoTokenizer, AutoModelWithLMHead

# `AutoModelWithLMHead` is deprecated in transformers; `AutoModelForCausalLM`
# is the supported auto class for GPT-2 style (decoder-only) checkpoints.
# Imported here so this cell does not depend on edits to the import lines above.
from transformers import AutoModelForCausalLM

# Left padding keeps the prompt adjacent to the generated tokens for a
# decoder-only model.
tokenizer = AutoTokenizer.from_pretrained("GroNLP/gpt2-small-italian", padding_side="left")
model = AutoModelForCausalLM.from_pretrained("GroNLP/gpt2-small-italian")

# Report the total parameter count, in millions.
model_size = sum(t.numel() for t in model.parameters())
print(f"Model size: {model_size/1000**2:.1f}M parameters")


def tokenize(element):
    """Tokenize a batch of messages and pack them into fixed-length blocks.

    All messages in the batch are tokenized without truncation, concatenated
    into one token stream with an EOS token between consecutive messages, and
    the stream is cut into consecutive blocks of ``context_length`` tokens.
    Trailing tokens that do not fill a whole block are discarded.

    Args:
        element: A batch mapping whose ``"content"`` entry is a list of
            message strings (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        ``{"input_ids": blocks}`` where ``blocks`` is a list of token-id lists,
        each exactly ``context_length`` long. The list is empty when the batch
        is empty or shorter than one block.
    """
    outputs = tokenizer(element["content"], truncation=False)

    # Build the joined stream in place. The previous `reduce(a + [eos] + b)`
    # copied the whole accumulated list for every message (quadratic in the
    # batch size) and raised TypeError on an empty batch.
    joined_sequence = []
    for index, ids in enumerate(outputs["input_ids"]):
        if index > 0:
            # Separator goes between messages only, not after the last one.
            joined_sequence.append(tokenizer.eos_token_id)
        joined_sequence.extend(ids)

    n_batches = len(joined_sequence) // context_length
    input_batch = [
        joined_sequence[i * context_length: (i + 1) * context_length]
        for i in range(n_batches)
    ]

    return {"input_ids": input_batch}


# Replace the raw text columns with packed `input_ids` blocks for both splits.
# The original columns must be removed because `tokenize` returns a different
# number of rows than it receives.
tokenized_datasets = raw_datasets.map(
    tokenize,
    batched=True,
    batch_size=batch_size_tokenizer,
    remove_columns=raw_datasets["train"].column_names,
)



Model size: 108.9M parameters


Map: 100%|██████████| 12459/12459 [00:00<00:00, 23789.55 examples/s]
Map: 100%|██████████| 3115/3115 [00:00<00:00, 26783.06 examples/s]


In [4]:
from transformers import DataCollatorForLanguageModeling

# Use the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# mlm=False selects causal (next-token) language modelling instead of masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [5]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    # Output
    output_dir="model_checkpoints",
    push_to_hub=False,
    # Batching
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=1,
    # Optimisation schedule
    num_train_epochs=1,
    learning_rate=5e-4,
    lr_scheduler_type="cosine",
    warmup_steps=10,
    weight_decay=0.1,
    fp16=False,
    # Evaluation, logging and checkpoint intervals are all 10 steps
    evaluation_strategy="steps",
    eval_steps=10,
    logging_steps=10,
    save_steps=10,
)

# Wire the model, tokenizer, collator and both dataset splits into the Trainer.
trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [6]:
# Run the fine-tuning loop; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
10,5.2449,4.868981
20,4.8935,4.677753
30,4.7255,4.592925
40,4.6764,4.540985


TrainOutput(global_step=46, training_loss=4.852084201315175, metrics={'train_runtime': 447.5274, 'train_samples_per_second': 3.271, 'train_steps_per_second': 0.103, 'total_flos': 95632883712000.0, 'train_loss': 4.852084201315175, 'epoch': 1.0})

In [28]:
# Generate on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Wrap the in-memory model and its tokenizer in a text-generation pipeline.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

In [31]:
# Sample a single continuation from the model, starting from an empty prompt.
txt = ""
pipe(txt, num_return_sequences=1)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


[{'generated_text': "ahahhaha 😍😅😏\n@gastabot.it @Daniele_Zoccheddu mi raccomando a te, ma tu sei sicuro che questo è il mio programma preferito e non ti preoccupare più di niente se l'ho messo in rete per la prima volta su internet? Se invece vuoi un po' fammi sapere cosa sta succedendo con i dati della tua vita! (però io sono una bella persona"}]

In [32]:
# Persist the pipeline to the local "model_dir" directory.
pipe.save_pretrained("model_dir")

In [None]:
# Rebuild a text-generation pipeline from the "model_dir" directory on disk.
pipeline(task="text-generation", model="model_dir")