In [None]:
import re

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from evaluate import load
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

LOAD DATASET

In [None]:
# Parquet shard paths inside the dataset repository, keyed by split name.
splits = dict(
    train='data/train-00000-of-00001-87e767a83d108945.parquet',
    validation='data/validation-00000-of-00001-2d1ce84ca498cf0b.parquet',
)

# Only the "train" shard is read into a DataFrame here; the "validation"
# entry is declared but not loaded by this cell.
dataset = pd.read_parquet(f"hf://datasets/musabg/wikipedia-tr-summarization/{splits['train']}")

SAMPLE AND CLEAN DATA

In [6]:
# Draw a reproducible 4000-row random subsample (random_state=42) and rename
# the "text" column to "article". The "summary" -> "summary" entry is an
# identity mapping and leaves that column name unchanged.
reviews_sample = (
    dataset
    .sample(n=4000, random_state=42)
    .rename(columns={"text": "article", "summary": "summary"})
)

def clean_text(text):
    """Strip unwanted characters from a text and normalize its whitespace.

    Every character that is not a word character, whitespace, or one of
    ``. , ! ?`` (plus the explicitly listed Turkish letters) is removed.
    Runs of whitespace are then collapsed to a single space and the result
    is trimmed at both ends.

    Args:
        text: The raw text. ``None`` and NaN are treated as missing values;
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned string, or ``""`` when the input is missing.
    """
    # Missing cells arrive as None or float NaN; NaN is the only value that
    # is not equal to itself, so `text != text` detects it without imports.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    text = re.sub(r"[^\w\s.,!?ğüşöçıİĞÜŞÖÇ]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Replace each raw article with its cleaned form; the "summary" column is
# not passed through clean_text.
reviews_sample["article"] = reviews_sample["article"].map(clean_text)

CONVERT AND SPLIT DATA

In [7]:
# Wrap the sampled DataFrame as a Hugging Face Dataset and split off 10% of
# the rows with a fixed seed.
hf_dataset = Dataset.from_pandas(reviews_sample)
train_test_split = hf_dataset.train_test_split(test_size=0.1, seed=42)

# The split result is indexed by "train" and "test"; the "test" part is used
# as the validation set in this notebook.
train_dataset, val_dataset = train_test_split["train"], train_test_split["test"]

MODEL AND DATA PREPROCESSING

In [None]:
# Checkpoint identifier shared by the model and its tokenizer.
model_name = "ozcangundes/mt5-small-turkish-summarization"

# Both objects are loaded from the same checkpoint so vocabulary and weights match.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_data(examples, max_input_length=512, max_target_length=200, label_pad_token_id=None):
    """Tokenize a batch of article/summary pairs for seq2seq training.

    Each article is prefixed with ``"summarize: "`` and tokenized; each
    summary is tokenized separately and its token ids are stored under
    ``"labels"``. Both sides are truncated and padded to a fixed length.

    Args:
        examples: Batch mapping with ``"article"`` and ``"summary"`` entries,
            each an iterable of strings of equal length.
        max_input_length: Fixed token length of the encoded articles.
        max_target_length: Fixed token length of the encoded summaries.
        label_pad_token_id: When not ``None``, every padded label position is
            replaced by this value. The default ``None`` keeps the
            tokenizer's pad token id in the labels.

    Returns:
        The tokenizer output for the articles with an added ``"labels"`` entry.
    """
    inputs = ["summarize: " + doc for doc in examples["article"]]
    outputs = examples["summary"]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding="max_length")
    labels = tokenizer(outputs, max_length=max_target_length, truncation=True, padding="max_length")
    label_ids = labels["input_ids"]

    # NOTE(review): with the default, padded label positions keep the pad
    # token id and are therefore scored by the training loss. Passing
    # label_pad_token_id=-100 excludes them, but any code that decodes the
    # labels must then map -100 back to the pad id first.
    if label_pad_token_id is not None:
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [label_pad_token_id if token == pad_id else token for token in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

# Tokenize both splits batch-wise. All columns that existed before the map
# are dropped, so only the fields returned by preprocess_data remain.
train_dataset = train_dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=train_dataset.column_names,
)
val_dataset = val_dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=val_dataset.column_names,
)

EVALUATION AND TRAINING SETUP

In [None]:
rouge = load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L (scaled by 100) for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds
            the token ids.

    Returns:
        Dict with keys ``"rouge1"``, ``"rouge2"`` and ``"rougeL"``.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is the "ignore" marker used for masked/padded positions and is not
    # a valid vocabulary id; swap it for the pad id so decoding cannot fail.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # NOTE(review): use_stemmer and the metric's default word splitting are
    # presumably English-oriented; confirm they are appropriate for Turkish text.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    def _as_percent(score):
        # The metric may return either a plain number or a mapping that
        # carries the F-measure under "fmeasure".
        if isinstance(score, dict):
            score = score["fmeasure"]
        return score * 100

    return {key: _as_percent(result[key]) for key in ("rouge1", "rouge2", "rougeL")}

# Training configuration.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    learning_rate=2e-5,
    lr_scheduler_type="linear",
    warmup_steps=500,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=4,
    weight_decay=0.01,
    predict_with_generate=True,
    # Generate up to the same length the reference summaries are tokenized to,
    # so ROUGE is not computed on prematurely cut-off predictions.
    generation_max_length=200,
    logging_dir="./logs",
    logging_steps=100,
    # metric_for_best_model only takes effect together with
    # load_best_model_at_end; a lower eval_loss is better.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # mT5 checkpoints are prone to overflow/NaN losses under fp16 mixed
    # precision, and fp16 needs a CUDA device. Use bf16 where the GPU
    # supports it and fall back to full precision otherwise.
    fp16=False,
    bf16=torch.cuda.is_available() and torch.cuda.is_bf16_supported(),
)

# Assemble the trainer from the model, its tokenizer, the tokenized splits,
# the training configuration and the ROUGE metric callback.
trainer = Seq2SeqTrainer(
    # what is trained
    model=model,
    tokenizer=tokenizer,
    # how it is trained
    args=training_args,
    # data
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # evaluation
    compute_metrics=compute_metrics,
)

TRAIN AND SAVE MODEL

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()
# Write the trainer's model to the ./trained_model directory.
trainer.save_model("./trained_model")