In [1]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSeq2SeqLM
from datasets import load_dataset
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AdamW, get_scheduler
from tqdm.auto import tqdm
import evaluate
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

In [2]:
languages = ["telugu", "urdu", "marathi", "hindi", "tamil", "bengali", "english"]
dfs = []
for lang in languages:
    dataset = load_dataset("csebuetnlp/xlsum", lang, split="train[:2000]")
    df = dataset.to_pandas()
    dfs.append(df)

In [3]:
df = pd.concat(dfs, ignore_index=True)
print(df.shape)
print(df.head())

(14000, 5)
                       id                                                url  \
0  international-53649907  https://www.bbc.com/telugu/international-53649907   
1          india-46550604          https://www.bbc.com/telugu/india-46550604   
2          india-43404438          https://www.bbc.com/telugu/india-43404438   
3  international-54671956  https://www.bbc.com/telugu/international-54671956   
4                53723894                https://www.bbc.com/telugu/53723894   

                                               title  \
0  పాకిస్తాన్ ఎయిర్‌లైన్స్‌లో నకిలీ లైసెన్సుల పైల...   
1  తెలంగాణ ముఖ్యమంత్రిగా కేసీఆర్ రెండోసారి ప్రమాణ...   
2  ‘అధికారం కొన్ని కులాల గుప్పిట్లోనే ఉండాలా? కుద...   
3  పోలండ్‌లో కొత్త అబార్షన్ చట్టాలను వ్యతిరేకిస్త...   
4  దిల్లీ అల్లర్లపై పరస్పర విరుద్ధ నివేదికలు... ఏ...   

                                             summary  \
0  పాకిస్తాన్ విమానయాన రంగంలో కొత్త సంక్షోభం మొదల...   
1  తెలంగాణ ముఖ్యమంత్రిగా కల్వకుంట్ల చంద్రశేఖర్ రా...   
2  

In [4]:
from transformers import MT5ForConditionalGeneration, AutoTokenizer

model_id = "csebuetnlp/mT5_multilingual_XLSum"

model = MT5ForConditionalGeneration.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [5]:
bert_model = AutoModel.from_pretrained("bert-base-multilingual-cased")
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

In [6]:
def textrank_bert(text, num_sentences=3):
    sentences = text.split('.')
    inputs = bert_tokenizer(sentences, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = bert_model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)
    similarity_matrix = cosine_similarity(embeddings)
    nx_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(nx_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    return " ".join([sentence for _, sentence in ranked_sentences[:num_sentences]])

In [7]:
class HybridSummaryDataset(Dataset):
    def __init__(self, data, tokenizer, text_max_token_len=200, summary_max_token_len=12):
        self.tokenizer = tokenizer
        self.data = data
        self.text_max_token_len = text_max_token_len
        self.summary_max_token_len = summary_max_token_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index: int):
        data_row = self.data.iloc[index]
        text = data_row['text']

        # Extractive summarization using TextRank with BERT
        extracted_summary = textrank_bert(text)

        # Tokenize extracted summary (input to MT5 model)
        text_encoding = tokenizer(
            extracted_summary,
            max_length=self.text_max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        # Tokenize target summary (ground truth)
        summary_encoding = tokenizer(
            data_row['summary'],
            max_length=self.summary_max_token_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

        labels = summary_encoding['input_ids']
        labels[labels == tokenizer.pad_token_id] = -100

        return dict(
            input_ids=text_encoding['input_ids'].flatten(),
            attention_mask=text_encoding['attention_mask'].flatten(),
            labels=labels.flatten(),
            decoder_attention_mask=summary_encoding['attention_mask'].flatten()
        )

In [8]:
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

train_dataset = HybridSummaryDataset(data=df_train, tokenizer=tokenizer)
test_dataset = HybridSummaryDataset(data=df_test, tokenizer=tokenizer)

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=10)
eval_dataloader = DataLoader(test_dataset, batch_size=10)

In [9]:
num_epochs = 10
num_training_steps = num_epochs * len(train_dataloader)

optimizer = AdamW(model.parameters())
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training loop
progress_bar = tqdm(range(num_training_steps))
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    print(f"Epoch: {epoch + 1} -- Loss: {loss.item()}")



  0%|          | 0/11200 [00:00<?, ?it/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch: 1 -- Loss: 3.5863237380981445
Epoch: 2 -- Loss: 2.4182894229888916
Epoch: 3 -- Loss: 2.1674599647521973
Epoch: 4 -- Loss: 1.6196531057357788
Epoch: 5 -- Loss: 1.1678788661956787
Epoch: 6 -- Loss: 0.789129912853241
Epoch: 7 -- Loss: 0.4748256504535675
Epoch: 8 -- Loss: 0.4586552083492279
Epoch: 9 -- Loss: 0.2479063868522644
Epoch: 10 -- Loss: 0.2037576138973236


In [None]:
rouge_metric = evaluate.load("rouge")
bleu_metric = evaluate.load("bleu")

model.eval()
all_predictions = []
all_references = []

for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model.generate(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], max_length=12)
    decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)
    all_predictions.extend(decoded_preds)
    all_references.extend(decoded_labels)

rouge_score = rouge_metric.compute(predictions=all_predictions, references=all_references)
bleu_score = bleu_metric.compute(predictions=all_predictions, references=all_references)

print("ROUGE Score:", rouge_score)
print("BLEU Score:", bleu_score)