In [None]:
from datasets import Dataset, load_dataset, load_metric
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
import re
from tqdm import tqdm
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np
import torch
import nltk
# Fetch the Punkt data that nltk.sent_tokenize (called in compute_metrics) relies on
nltk.download('punkt')

In [None]:
# Fetch the XL-Sum corpus for each of the three languages used in this notebook

dataset_gujarati, dataset_hindi, dataset_telugu = (
    load_dataset("csebuetnlp/xlsum", language)
    for language in ("gujarati", "hindi", "telugu")
)

In [None]:
# Hindi subset: keep the first 5000 training pairs and the first 1200
# test / validation pairs

hindi_train = dataset_hindi['train']
train_text_hi, train_summary_hi = hindi_train['text'][:5000], hindi_train['summary'][:5000]

hindi_test = dataset_hindi['test']
test_text_hi, test_summary_hi = hindi_test['text'][:1200], hindi_test['summary'][:1200]

hindi_validation = dataset_hindi['validation']
eval_text_hi, eval_summary_hi = hindi_validation['text'][:1200], hindi_validation['summary'][:1200]

In [None]:
# Extracting the text and summary from the Gujarati dataset.
# These `*_gj` lists are what the Gujarati cleaning cell consumes, so they must
# be read from `dataset_gujarati` (not the Hindi dataset).
# NOTE(review): the 5000 / 1200 caps mirror the Hindi cell, while the Telugu
# cell uses the full splits -- confirm which is intended for Gujarati.

train_text_gj = dataset_gujarati['train']['text'][:5000]
train_summary_gj = dataset_gujarati['train']['summary'][:5000]

test_text_gj = dataset_gujarati['test']['text'][:1200]
test_summary_gj = dataset_gujarati['test']['summary'][:1200]

eval_text_gj = dataset_gujarati['validation']['text'][:1200]
eval_summary_gj = dataset_gujarati['validation']['summary'][:1200]

In [None]:
# Telugu subset: every split is used in full (no row cap is applied)

telugu_train = dataset_telugu['train']
train_text_te, train_summary_te = telugu_train['text'], telugu_train['summary']

telugu_test = dataset_telugu['test']
test_text_te, test_summary_te = telugu_test['text'], telugu_test['summary']

telugu_validation = dataset_telugu['validation']
eval_text_te, eval_summary_te = telugu_validation['text'], telugu_validation['summary']

In [None]:
#Functions to clean and normalize the dataset

# Per-language pattern matching every character that is NOT in the language's
# Unicode block, an ASCII digit, or whitespace. Compiled once instead of on
# every document.
_NON_SCRIPT_PATTERNS = {
    "gu": re.compile(r'[^\u0A80-\u0AFF0-9\s]'),
    "hi": re.compile(r'[^\u0900-\u097F0-9\s]'),
    "te": re.compile(r'[^\u0C00-\u0C7F0-9\s]'),
}

# Normalizers are created lazily and reused: the cleaners below are called once
# per article/summary, so rebuilding the factory each time is wasted work.
_NORMALIZERS = {}


def _clean_indic_text(text, lang):
    """Strip out-of-script characters from `text`, then normalize it for `lang`.

    Args:
        text: The raw string to clean.
        lang: Language code understood by IndicNormalizerFactory and present in
            `_NON_SCRIPT_PATTERNS` ("gu", "hi" or "te").

    Returns:
        The normalized string containing only in-script characters, ASCII
        digits and whitespace (as far as the pre-normalization filter goes).
    """
    normalizer = _NORMALIZERS.get(lang)
    if normalizer is None:
        normalizer = IndicNormalizerFactory().get_normalizer(lang)
        _NORMALIZERS[lang] = normalizer

    stripped = _NON_SCRIPT_PATTERNS[lang].sub('', text)
    return normalizer.normalize(stripped)


def clean_gujarati_text(text):
    """Keep only Gujarati-block characters, digits and whitespace, then normalize."""
    return _clean_indic_text(text, "gu")


def clean_hindi_text(text):
    """Keep only Devanagari-block characters, digits and whitespace, then normalize."""
    return _clean_indic_text(text, "hi")


def clean_telugu_text(text):
    """Keep only Telugu-block characters, digits and whitespace, then normalize."""
    return _clean_indic_text(text, "te")

In [None]:
# Run the Hindi cleaner over every article and summary of each split

cleaned_train_text_hi = list(map(clean_hindi_text, train_text_hi))
cleaned_train_summary_hi = list(map(clean_hindi_text, train_summary_hi))

cleaned_test_text_hi = list(map(clean_hindi_text, test_text_hi))
cleaned_test_summary_hi = list(map(clean_hindi_text, test_summary_hi))

cleaned_eval_text_hi = list(map(clean_hindi_text, eval_text_hi))
cleaned_eval_summary_hi = list(map(clean_hindi_text, eval_summary_hi))

In [None]:
# Run the Gujarati cleaner over every article and summary of each split

cleaned_train_text_gj = list(map(clean_gujarati_text, train_text_gj))
cleaned_train_summary_gj = list(map(clean_gujarati_text, train_summary_gj))

cleaned_test_text_gj = list(map(clean_gujarati_text, test_text_gj))
cleaned_test_summary_gj = list(map(clean_gujarati_text, test_summary_gj))

cleaned_eval_text_gj = list(map(clean_gujarati_text, eval_text_gj))
cleaned_eval_summary_gj = list(map(clean_gujarati_text, eval_summary_gj))

In [None]:
# Run the Telugu cleaner over every article and summary of each split

cleaned_train_text_te = list(map(clean_telugu_text, train_text_te))
cleaned_train_summary_te = list(map(clean_telugu_text, train_summary_te))

cleaned_test_text_te = list(map(clean_telugu_text, test_text_te))
cleaned_test_summary_te = list(map(clean_telugu_text, test_summary_te))

cleaned_eval_text_te = list(map(clean_telugu_text, eval_text_te))
cleaned_eval_summary_te = list(map(clean_telugu_text, eval_summary_te))

In [None]:
# Concatenate the three languages per split, always in the order
# Hindi, Gujarati, Telugu so texts and summaries stay aligned

train_text = [*cleaned_train_text_hi, *cleaned_train_text_gj, *cleaned_train_text_te]
train_summary = [*cleaned_train_summary_hi, *cleaned_train_summary_gj, *cleaned_train_summary_te]

test_text = [*cleaned_test_text_hi, *cleaned_test_text_gj, *cleaned_test_text_te]
test_summary = [*cleaned_test_summary_hi, *cleaned_test_summary_gj, *cleaned_test_summary_te]

eval_text = [*cleaned_eval_text_hi, *cleaned_eval_text_gj, *cleaned_eval_text_te]
eval_summary = [*cleaned_eval_summary_hi, *cleaned_eval_summary_gj, *cleaned_eval_summary_te]

In [None]:
# Wrap each merged split in a Dataset with "text" and "summary" columns

train_dataset, test_dataset, eval_dataset = (
    Dataset.from_dict({"text": texts, "summary": summaries})
    for texts, summaries in (
        (train_text, train_summary),
        (test_text, test_summary),
        (eval_text, eval_summary),
    )
)

In [None]:
# Define the model and tokenizer
#
# A multilingual checkpoint is needed here: the original T5 checkpoints ship an
# English-centric SentencePiece vocabulary, so Devanagari, Gujarati and Telugu
# text would be tokenized almost entirely to <unk>. mT5 is pre-trained on a
# multilingual corpus that includes all three languages.
# NOTE(review): mT5 is not pre-trained with task prefixes, so the "summarize: "
# prefix used during preprocessing is harmless but not required.

model_checkpoint = "google/mt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Tokenization settings and the batch preprocessing function built on them

prefix = "summarize: "
max_input_length = 512
max_target_length = 64


def preprocess_data(data):
    """Tokenize one batch of articles and summaries for seq2seq training.

    Args:
        data: Mapping with a "text" column (articles) and a "summary" column
            (reference summaries), each a list of strings.

    Returns:
        The tokenizer output for the prefixed articles (truncated to
        `max_input_length`), with an added "labels" entry holding the token
        ids of the summaries (truncated to `max_target_length`).
    """
    prompts = []
    for article in data["text"]:
        prompts.append(prefix + article)

    encoded = tokenizer(prompts, max_length=max_input_length, truncation=True)

    # Summaries are tokenized in target mode so they are treated as decoder text
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(
            data["summary"],
            max_length=max_target_length,
            truncation=True,
        )["input_ids"]

    encoded["labels"] = target_ids
    return encoded

In [None]:
# Preprocessing and tokenizing the data (batched, one call per split)

tokenized_train_dataset, tokenized_test_dataset, tokenized_eval_dataset = (
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, test_dataset, eval_dataset)
)

In [None]:
# Preparing the model arguments

batch_size = 2
model_name = "t5-small-multilungual-summarization"
model_dir = "output_dir"

args = Seq2SeqTrainingArguments(
    model_dir,
    # --- optimisation ---
    num_train_epochs=1,
    learning_rate=4e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=False,
    # --- evaluation: every 100 steps, scoring generated text ---
    evaluation_strategy="steps",
    eval_steps=100,
    predict_with_generate=True,
    # --- logging: every 100 steps, to TensorBoard ---
    logging_strategy="steps",
    logging_steps=100,
    report_to="tensorboard",
    # --- checkpointing: every 200 steps, keep 3, reload the best by rouge1 ---
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="rouge1",
)

In [None]:
# ROUGE scorer used by compute_metrics
metric = load_metric("rouge")

# Collator that batches the tokenized examples for the seq2seq trainer
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

In [None]:
# Defining function for computing metrics

def compute_metrics(eval_pred):
    """Decode generated and reference token ids and score them with ROUGE.

    Args:
        eval_pred: A `(predictions, labels)` pair of token-id arrays.
            `predictions` may also be a tuple whose first element is the
            array of generated ids.

    Returns:
        Dict mapping each ROUGE key returned by `metric.compute` to its mid
        F-measure scaled to 0-100, plus "gen_len" (mean number of non-pad
        tokens per prediction). All values are rounded to 4 decimals.

    NOTE(review): the ROUGE scorer's default tokenizer and `use_stemmer=True`
    look English-oriented; confirm the scores are meaningful for Hindi,
    Gujarati and Telugu text before relying on them for model selection.
    """
    predictions, labels = eval_pred

    # Some models hand back (generated_ids, extra_outputs); only the ids are scored.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding / ignore marker, not a vocabulary id, and cannot be
    # decoded. Map it to the pad token in BOTH arrays before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # The scorer is given one sentence per line.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                     for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Computed on the cleaned predictions so -100 padding is not counted as text.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Preparing the trainer to train the model

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Use the GPU when one is available. The trainer's own `args.device` is a
# read-only property, so it must not (and need not) be assigned to.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train_dataset,
    # Periodic evaluation and best-checkpoint selection use the validation
    # split; the test split stays held out for the final evaluation.
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Train the model.
# This is the cell's last expression, so whatever train() returns is shown as
# the cell output.

trainer.train()