## ***Model Training - PEGASUS***

In [None]:
!pip install datasets evaluate rouge_score bert_score

In [None]:
# Importing the libraries
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
from rouge_score import rouge_scorer
import torch

In [None]:
# Locations of the pre-processed training and validation CSV files.
train_path = "/content/drive/MyDrive/MScDissertation-Sonu/data/processed/train_processed.csv"
valid_path = "/content/drive/MyDrive/MScDissertation-Sonu/data/processed/validation_processed.csv"

# Read both splits into DataFrames (train first, then validation).
df_train, df_valid = (pd.read_csv(csv_path) for csv_path in (train_path, valid_path))

# Only the 'input_text' / 'target_text' columns are carried over into the
# Dataset objects that the tokenization step consumes.
train_frame = df_train[['input_text', 'target_text']]
valid_frame = df_valid[['input_text', 'target_text']]
train_text_data = Dataset.from_pandas(train_frame)
valid_text_data = Dataset.from_pandas(valid_frame)

In [None]:
# Identifier of the pretrained checkpoint; the tokenizer and the seq2seq model
# are both loaded from it so their vocabularies match.
model_name = "google/pegasus-xsum"

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tknzr = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Tokenizing the data.
# Token budgets: inputs and targets longer than these are truncated.
max_input_length = 512
max_target_length = 256


def prprcs_fun(examples):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        examples: Batch mapping with 'input_text' and 'target_text' entries,
            as supplied by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the input texts, with an extra 'labels'
        entry holding the token ids of the target texts.
    """
    model_inputs = tknzr(
        examples['input_text'],
        max_length=max_input_length,
        truncation=True,
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager. The targets are tokenized in a separate call so they get their
    # own `max_length` instead of sharing the input budget.
    labels = tknzr(
        text_target=examples['target_text'],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs['labels'] = labels['input_ids']
    return model_inputs


# Replace the raw text columns with the tokenized features for both splits.
tokenized_train = train_text_data.map(
    prprcs_fun,
    batched=True,
    remove_columns=train_text_data.column_names,
)
tokenized_valid = valid_text_data.map(
    prprcs_fun,
    batched=True,
    remove_columns=valid_text_data.column_names,
)


In [None]:
# Per-device batch size, shared by the training and evaluation loops.
batch_size = 16
# Directory that receives the checkpoints written while training.
model_output_dir = "/content/drive/MyDrive/MScDissertation-Sonu/models/pegasus-xsum"

args = Seq2SeqTrainingArguments(
    output_dir=model_output_dir,
    # Evaluation: once per epoch, using generate() so that text-based metrics
    # can be computed on the decoded predictions.
    eval_strategy="epoch",
    predict_with_generate=True,
    per_device_eval_batch_size=batch_size,
    # Optimisation settings.
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Book-keeping: log every 50 steps and cap the number of kept checkpoints.
    logging_steps=50,
    save_total_limit=3,
)

In [None]:
# Metric objects are loaded once and reused on every evaluation pass.
rouge_metric = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")


def computeMtrx(eval_preds):
    """Compute ROUGE and BERTScore for one evaluation pass.

    Args:
        eval_preds: Pair ``(predictions, labels)`` of token-id arrays handed
            over by the trainer. ``predictions`` may also be a tuple whose
            first element holds the generated token ids.

    Returns:
        Dict of the ROUGE scores as returned by the metric, plus
        'bert_score_f1' (mean BERTScore F1 multiplied by 100), each rounded
        to 4 decimal places.
    """
    preds, labels = eval_preds
    # Some models return extra outputs alongside the generated ids; only the
    # first element can be decoded.
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tknzr.pad_token_id
    # -100 is a padding/ignore marker, not a real token id. It can appear in
    # the predictions as well as the labels, and decoding it raises an
    # out-of-range error, so it is mapped to the pad token in both arrays.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tknzr.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tknzr.batch_decode(labels, skip_special_tokens=True)

    rougeRslt = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels)
    bertRslt = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    # BERTScore returns one F1 value per example; report their mean on a
    # 0-100 scale. ROUGE values are passed through unscaled.
    bertRslt = {"bert_score_f1": np.mean(bertRslt['f1']) * 100}

    result = {**rougeRslt, **bertRslt}
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Training the model.
# The collator pads each batch dynamically; it is given the model as well as
# the tokenizer.
data_collator = DataCollatorForSeq2Seq(tknzr, model=model)

# NOTE(review): newer transformers releases rename the `tokenizer` argument to
# `processing_class` — confirm against the installed version before upgrading.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tknzr,
    compute_metrics=computeMtrx,
)

print("starting training....")
trainer.train()


In [None]:
# Write the trained model to its own directory, separate from the checkpoint
# directory used during training.
final_model_path = "/content/drive/MyDrive/MScDissertation-Sonu/models/pegasus-xsum-final"
trainer.save_model(output_dir=final_model_path)