In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, EncoderDecoderModel, GPT2Tokenizer, BertTokenizer, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline
from sacrebleu import corpus_bleu

In [None]:
# Read the CSV and reduce it to the rows and columns used by this notebook

file_path = "arawebnlg2020.csv"
categories = ['SportsTeam', 'CelestialBody', 'Food', 'Artist', 'University', 'Politician']

# Pipeline: keep the selected categories -> drop rows with any missing value
# -> keep the source text, target text and the 'data' split marker.
df = (
    pd.read_csv(file_path)
    .loc[lambda frame: frame['category'].isin(categories)]
    .dropna()
    .loc[:, ['input_text_ar', 'target_text_ar', 'data']]
)
print(f"Found {len(df)} entries.")
print(df.head())

In [4]:
# Build one Dataset per split (train / dev / test) from the rows whose 'data'
# column carries that split name, keeping only the source and target text.
train_data, dev_data, test_data = (
    Dataset.from_pandas(df.loc[df['data'] == split_name, ['input_text_ar', 'target_text_ar']])
    for split_name in ('train', 'dev', 'test')
)

In [None]:
# Show the summary of each split, one per line
print(train_data, dev_data, test_data, sep="\n")

In [9]:
# Pretrained checkpoints used for the encoder and the decoder
model_name_encoder = "aubmindlab/bert-base-arabert"
model_name_decoder = "aubmindlab/aragpt2-base"

# Maximum token lengths used when padding/truncating source and target text
encoder_max_length = 256
decoder_max_length = 256

# Batch size shared by dataset preprocessing, training and evaluation
batch_size = 2

In [None]:
encoder_tokenizer = BertTokenizer.from_pretrained(model_name_encoder)
# Alias the tokenizer's CLS token as BOS and its SEP token as EOS
encoder_tokenizer.bos_token, encoder_tokenizer.eos_token = (
    encoder_tokenizer.cls_token,
    encoder_tokenizer.sep_token,
)


# Make sure GPT2 wraps every sequence in BOS ... EOS
def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
    """
    Wrap token ids with the tokenizer's BOS and EOS ids.

    Intended to be installed as a tokenizer method, so ``self`` is any object
    exposing ``bos_token_id`` and ``eos_token_id``.

    Args:
        self: Object providing ``bos_token_id`` and ``eos_token_id``.
        token_ids_0 (list[int]): Ids of the first (or only) sequence.
        token_ids_1 (list[int] | None): Optional ids of a second sequence.
            When given, it is appended after the first sequence and closed
            with its own EOS instead of being silently dropped.

    Returns:
        list[int]: ``[BOS] + token_ids_0 + [EOS]`` for a single sequence, or
        ``[BOS] + token_ids_0 + [EOS] + token_ids_1 + [EOS]`` for a pair.
    """
    # list(...) builds new lists, so the caller's inputs are never mutated
    outputs = [self.bos_token_id] + list(token_ids_0) + [self.eos_token_id]
    if token_ids_1 is not None:
        outputs += list(token_ids_1) + [self.eos_token_id]
    return outputs

# Replace the method on the GPT2Tokenizer class itself, so the tokenizer
# instance created below uses the BOS/EOS-wrapping implementation.
GPT2Tokenizer.build_inputs_with_special_tokens = build_inputs_with_special_tokens
decoder_tokenizer = GPT2Tokenizer.from_pretrained(model_name_decoder)
# Reuse the UNK token as the padding token.
# Be careful: per the original note, unk_token_id == eos_token_id == bos_token_id
# for this tokenizer, so a padding position cannot be told apart from a real
# BOS/EOS by token id alone.
decoder_tokenizer.pad_token = decoder_tokenizer.unk_token

In [12]:
# Function to tokenize and process data for model input
def process_data_to_model_inputs(batch):
    """
    Tokenize a batch of source/target text for the encoder-decoder model.

    The source text is tokenized with the notebook-level ``encoder_tokenizer``
    and the target text with ``decoder_tokenizer``. Both are padded and
    truncated to ``encoder_max_length`` / ``decoder_max_length`` (also
    notebook-level names, not parameters of this function).

    Args:
        batch (dict): A batch with the keys 'input_text_ar' and
            'target_text_ar', each holding a list of strings.

    Returns:
        dict: The same ``batch`` object with these keys added:
        ``input_ids``, ``attention_mask``, ``decoder_input_ids``,
        ``decoder_attention_mask`` and ``labels``. ``labels`` equals the
        decoder ids with every padding position replaced by -100.
    """
    # Tokenize inputs using the encoder tokenizer
    inputs = encoder_tokenizer(batch["input_text_ar"], padding="max_length", truncation=True, max_length=encoder_max_length)

    # Tokenize outputs using the decoder tokenizer
    outputs = decoder_tokenizer(batch["target_text_ar"], padding="max_length", truncation=True, max_length=decoder_max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    # NOTE(review): decoder_input_ids is the unshifted target sequence. Whether
    # the model shifts it internally depends on the installed transformers
    # version -- confirm against the version in use.
    batch["decoder_input_ids"] = outputs.input_ids
    batch["decoder_attention_mask"] = outputs.attention_mask

    # Mask padding for the loss using the attention mask rather than the token
    # id: the decoder's pad token is its unk token, which (per the tokenizer
    # setup note) shares its id with BOS/EOS. Comparing against pad_token_id
    # would therefore also blank out the real BOS/EOS labels.
    batch["labels"] = [
        [token if keep else -100 for token, keep in zip(ids, mask)]
        for ids, mask in zip(outputs.input_ids, outputs.attention_mask)
    ]

    return batch

In [None]:
# Columns exposed as torch tensors once a split has been tokenized
_MODEL_COLUMNS = ("input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels")


def _tokenize_split(split):
    """
    Tokenize one dataset split and expose the model columns as torch tensors.

    Runs ``process_data_to_model_inputs`` over ``split`` in batches of
    ``batch_size``, removes the two raw text columns, and sets the output
    format to torch for the columns listed in ``_MODEL_COLUMNS``.

    Args:
        split: Dataset holding 'input_text_ar' and 'target_text_ar'.

    Returns:
        The mapped dataset with its format set.
    """
    tokenized = split.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=['input_text_ar', 'target_text_ar'],
    )
    tokenized.set_format(type="torch", columns=list(_MODEL_COLUMNS))
    return tokenized


train_data = _tokenize_split(train_data)
dev_data = _tokenize_split(dev_data)
test_data = _tokenize_split(test_data)

In [14]:
def compute_metrics(pred):
    """
    Compute corpus-level BLEU and its 1- to 4-gram precisions.

    Predictions and labels are decoded with the notebook-level
    ``decoder_tokenizer`` (special tokens skipped) and scored once with
    ``corpus_bleu``.

    Args:
        pred: Object with ``predictions`` and ``label_ids`` arrays of token
            ids. Positions equal to -100 are treated as padding.

    Returns:
        dict: ``bleu`` (overall score) and ``bleu_1`` .. ``bleu_4`` (n-gram
        precisions), each rounded to 4 decimals.
    """
    eos_id = decoder_tokenizer.eos_token_id

    # Work on copies so the arrays held by ``pred`` are not modified in place.
    # -100 is not a valid token id, so swap it for EOS before decoding; EOS is
    # then dropped by skip_special_tokens=True.
    pred_ids = pred.predictions.copy()
    pred_ids[pred_ids == -100] = eos_id
    labels_ids = pred.label_ids.copy()
    labels_ids[labels_ids == -100] = eos_id

    # all unnecessary tokens are removed
    pred_str = decoder_tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = decoder_tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # Score once and reuse the result for every reported value
    bleu = corpus_bleu(pred_str, [label_str])

    return {
        "bleu": round(bleu.score, 4),
        "bleu_1": round(bleu.precisions[0], 4),
        "bleu_2": round(bleu.precisions[1], 4),
        "bleu_3": round(bleu.precisions[2], 4),
        "bleu_4": round(bleu.precisions[3], 4),
    }

In [None]:
# Create an Encoder-Decoder model from the two pretrained checkpoints
# (encoder: model_name_encoder, decoder: model_name_decoder); weights are not tied.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name_encoder, model_name_decoder, tie_encoder_decoder=False)

# set decoding params
model.decoder.config.use_cache = False
model.config.decoder_start_token_id = decoder_tokenizer.bos_token_id
model.config.eos_token_id = decoder_tokenizer.eos_token_id
# The trainer is not given a tokenizer, so the padding id has to be available
# on the model config.
model.config.pad_token_id = decoder_tokenizer.pad_token_id
# Keep the generation length in sync with the decoder length used for the labels
model.config.max_length = decoder_max_length
model.config.no_repeat_ngram_size = 3
# These belong on model.config: setting them as plain attributes on the model
# object (as before) has no effect on generation.
model.config.early_stopping = False
model.config.num_beams = 4

In [None]:
# Configure the training run
training_args = Seq2SeqTrainingArguments(
    # where checkpoints and logs are written
    output_dir='/output/dir',
    overwrite_output_dir=True,
    logging_dir='/log/dir',
    # what to run
    do_train=True,
    do_eval=True,
    num_train_epochs=5,
    # batching and precision
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # step-based evaluation, using generate() to produce predictions
    evaluation_strategy="steps",
    eval_steps=1000,
    predict_with_generate=True,
    # logging and checkpoint cadence (in steps)
    logging_steps=100,
    save_steps=1000,
)

In [None]:
# Build the trainer: train on the training split, evaluate on the dev split
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,  # BLEU metrics computed from decoded predictions
    train_dataset=train_data,
    eval_dataset=dev_data,
)

In [None]:
# Train the model with the trainer configured above.
# The call's return value is the cell's last expression, so the notebook displays it.
trainer.train()

In [None]:
# Evaluate on test data
test_output = trainer.predict(test_data)
# Display only the metrics: the full prediction output also carries the raw
# prediction and label arrays, which would flood the cell output.
test_output.metrics

In [None]:
# Generate samples
# A text2text-generation pipeline uses a single tokenizer for both input and
# output, but this model needs encoder_tokenizer for the source text and
# decoder_tokenizer for the generated ids. So encode / generate / decode explicitly.
generation_model = EncoderDecoderModel.from_pretrained('checkpoint')  # path to a saved checkpoint
generation_model.eval()

input_text = ''  # source text to generate from
encoded_input = encoder_tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    max_length=encoder_max_length,
)
generated_ids = generation_model.generate(
    input_ids=encoded_input.input_ids,
    attention_mask=encoded_input.attention_mask,
    num_beams=5,
    max_length=256,
    # top_p was dropped: it only applies when sampling, and this is beam search
    repetition_penalty=3.0,
    no_repeat_ngram_size=3,
    pad_token_id=decoder_tokenizer.pad_token_id,
)
output_text = decoder_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(output_text)