In [None]:
import os
import gc
import warnings
warnings.filterwarnings("ignore")

In [None]:
import evaluate
import torch
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import logging, AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
# Limit transformers log output to errors only.
logging.set_verbosity_error()

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Reclaim unreferenced objects, then fix torch's RNG seed.
gc.collect()
torch.manual_seed(42)

In [None]:
# Read the train / validation / test CSV files, in that order.
train_data, validation_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../dataset/full_train_data_summarization.csv',
        '../dataset/full_validation_data_summarization.csv',
        '../dataset/full_test_data_summarization.csv',
    )
)

In [None]:
# Keep only the leading rows of each split: 6000 train, 200 validation, 2000 test.
train_data = train_data.head(6000)
validation_data = validation_data.head(200)
test_data = test_data.head(2000)

In [None]:
# Identifier passed to `from_pretrained` for both the tokenizer and the model,
# and used to derive the output directory name.
# NOTE(review): left empty here — it must be set before the cells below can run.
model_name = ''

In [None]:
# Load the tokenizer that matches `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Instruction prepended to every document before tokenization.
prefix = "Hãy tóm tắt ngắn gọn nội dung sau bằng tiếng Việt: "


def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Args:
        examples: Batch mapping with a "context" column (source documents)
            and a "summarization" column (target summaries).

    Returns:
        The tokenizer output for the prefixed documents (truncated to 4096
        tokens) with an added "labels" entry holding the token ids of the
        summaries (truncated to 1024 tokens).
    """
    prompted_docs = []
    for doc in examples["context"]:
        prompted_docs.append(prefix + doc)

    encoded = tokenizer(prompted_docs, max_length=4096, truncation=True)
    target_encoding = tokenizer(
        text_target=examples["summarization"],
        max_length=1024,
        truncation=True,
    )
    encoded["labels"] = target_encoding["input_ids"]
    return encoded

In [None]:
# Wrap the pandas frames as Hugging Face datasets.
# `Dataset.from_pandas` is the constructor intended for DataFrames
# (`from_dict` expects a plain mapping of column name -> values);
# `preserve_index=False` keeps the pandas index out of the dataset columns.
new_data = DatasetDict({
    "train": Dataset.from_pandas(train_data, preserve_index=False),
    "validation": Dataset.from_pandas(validation_data, preserve_index=False),
})

In [None]:
# Tokenize every split; `batched=True` hands `preprocess_function` a batch of rows per call.
tokenized_new_data = new_data.map(preprocess_function, batched=True)

In [None]:
# Pads inputs and labels dynamically per batch.
# The collator's optional `model` argument expects the model *instance*, not
# the checkpoint name string that was passed here before; the model is not
# loaded yet at this point in the notebook, so the argument is omitted.
# NOTE(review): if label smoothing is enabled later, build the collator after
# the model is loaded and pass the model object — confirm whether that is needed.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [None]:
# Load the BLEU metric from the `evaluate` library.
bleu = evaluate.load("bleu")

In [None]:
def compute_metrics(eval_pred):
    """Compute corpus-level BLEU scores for generated summaries.

    Uses the module-level `bleu` metric (loaded with `evaluate`) and the
    module-level `tokenizer`. The previous implementation called
    `sentence_bleu`, which is never imported in this notebook, and passed raw
    strings as token sequences.

    Args:
        eval_pred: Pair `(predictions, labels)` of token-id arrays. `-100`
            entries (ignored/padding positions) are replaced by the pad token
            id before decoding.

    Returns:
        dict with keys 'bleu@1'..'bleu@4' (brevity penalty times the n-gram
        precision of that order) and 'bleu@avg' (standard BLEU with uniform
        weights over orders 1-4). All values are corpus-level, not a mean of
        per-sentence scores.
    """
    metric_keys = ('bleu@1', 'bleu@2', 'bleu@3', 'bleu@4', 'bleu@avg')

    predictions, labels = eval_pred
    # NOTE(review): predictions may arrive as a tuple whose first element holds
    # the generated ids — keep only that element if so.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 cannot be decoded; map it to the pad token, which decoding skips.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    try:
        result = bleu.compute(
            predictions=decoded_preds,
            references=[[reference_text] for reference_text in decoded_labels],
        )
    except ZeroDivisionError:
        # NOTE(review): the BLEU implementation appears to divide by the total
        # prediction/reference length; treat an all-empty batch as score 0.
        return dict.fromkeys(metric_keys, 0.0)

    brevity_penalty = result["brevity_penalty"]
    scores = {
        f'bleu@{order}': brevity_penalty * precision
        for order, precision in enumerate(result["precisions"][:4], start=1)
    }
    scores['bleu@avg'] = result["bleu"]
    return scores

In [None]:
# Load the pretrained sequence-to-sequence model to be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch, keeping
# only the most recent checkpoint on disk.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name.replace('/', '_')}_model_summarization",
    learning_rate=1e-5,
    auto_find_batch_size=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=5,
    # Generate sequences during evaluation so `compute_metrics` receives token ids.
    predict_with_generate=True,
    # Mixed precision needs a CUDA device; the notebook otherwise falls back to
    # CPU, where an unconditional fp16=True is rejected.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    save_total_limit=1,
    save_strategy='epoch',
    evaluation_strategy='epoch'
)

In [None]:
# Assemble the trainer from the model, arguments, tokenized splits and metric function.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_new_data["train"],
    eval_dataset=tokenized_new_data["validation"],
)

# Release cached GPU memory and collect garbage before training starts.
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Run fine-tuning with the configuration assembled above.
trainer.train()

# Testing the Summarization Model

In [None]:
# Checkpoint to evaluate.
# NOTE(review): left empty here — it must point at a trained checkpoint before running.
model_checkpoint = ''

# Reload tokenizer and model from the checkpoint and place the model on `device`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

# Wrap in DataParallel when more than one GPU is visible.
if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)

def generate_text(text):
    """Generate a summary for a single document.

    Uses the module-level `tokenizer`, `model` and `device`.

    Args:
        text: Source document as a string; the summarization instruction
            prefix is prepended before tokenization.

    Returns:
        The decoded first generated sequence, or None if generation returns
        no sequences.
    """
    prefix = 'Hãy tóm tắt ngắn gọn nội dung sau bằng tiếng Việt: '
    # Truncate to the same 4096-token input limit used when tokenizing the
    # training data so inference inputs match what the model was trained on.
    encoding = tokenizer(prefix+text, return_tensors="pt", truncation=True, max_length=4096)
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    # `model` is wrapped in DataParallel only when several GPUs are present;
    # unwrap it when wrapped, otherwise use it directly (`.module` would not exist).
    generator = getattr(model, "module", model)

    # NOTE(review): `temperature` / `top_p` are sampling parameters and
    # `do_sample` is not set here — confirm whether sampling is intended.
    outputs = generator.generate(
        input_ids=input_ids, attention_mask=attention_masks,
        early_stopping=False,
        max_new_tokens=1024,
        temperature=0.7,
        top_p=0.8
    )
    if len(outputs) == 0:
        return None
    # A single input is encoded, so only the first sequence is decoded.
    return tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

In [None]:
# Generate a summary for every test document and store it in a column named after the model.
# The f-string uses double quotes outside so the single-quoted arguments inside
# the replacement field also parse on Python versions before 3.12.
test_data[f"generate_{model_name.replace('/', '_')}"] = test_data['context'].apply(generate_text)