In [None]:
!pip install transformers==4.30
!pip install accelerate -U
!pip install rouge_score
!pip install datasets
!pip install evaluate
!pip install sacremoses

In [None]:
import pandas as pd
import torch
import pickle
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, MT5ForConditionalGeneration, MT5Tokenizer
from datasets import Dataset
import numpy as np
import nltk
import evaluate
rouge_score = evaluate.load("rouge")
nltk.download("punkt")
from transformers import DataCollatorForSeq2Seq
from nltk.tokenize import sent_tokenize

In [None]:
!pip install indic-nlp-library

In [None]:
!pip install datasets transformers accelerate evaluate sentencepiece

In [None]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
from transformers import MBartForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import AutoTokenizer

# Hub identifier of the pretrained checkpoint. It has its own name so that
# `model` only ever refers to the loaded network, never to a string.
model_checkpoint = "google/mt5-base"

# use_fast=False selects the slow (Python) tokenizer implementation.
# NOTE(review): do_lower_case / keep_accents look like options of a different
# tokenizer class; confirm the mT5 tokenizer actually honours them.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    do_lower_case=False,
    use_fast=False,
    keep_accents=True,
)

# Load the pretrained seq2seq weights and move them to the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)


In [None]:
# Look up the vocabulary ids of the special tokens through the public
# tokenizer API rather than the private `_convert_token_to_id_with_added_voc`.
# NOTE(review): none of these ids is referenced by any other cell visible here.
# NOTE(review): presumably mT5 has no "<s>" token, in which case bos_id
# resolves to the unknown-token id -- confirm before relying on it.
bos_id = tokenizer.convert_tokens_to_ids("<s>")
eos_id = tokenizer.convert_tokens_to_ids("</s>")
pad_id = tokenizer.convert_tokens_to_ids("<pad>")


In [None]:
# Locations of the three dataset splits (Kaggle input CSV files).
train_path = "/kaggle/input/mt5-bm25/hindi_train_final_BM25.csv"
val_path = "/kaggle/input/mt5-bm25/hindi_val_final_BM25.csv"
test_path = "/kaggle/input/mt5-bm25/hindi_test_final_BM25.csv"

# From every split keep only the source text and its reference summary.
keep_columns = ["Article", "Summary"]
df_train = pd.read_csv(train_path)[keep_columns]
df_val = pd.read_csv(val_path)[keep_columns]
df_test = pd.read_csv(test_path)[keep_columns]

In [None]:
# Discard every row that is missing either the article or the summary.
df_train, df_val, df_test = (
    frame.dropna() for frame in (df_train, df_val, df_test)
)

In [None]:
# Wrap each cleaned DataFrame in a `datasets.Dataset`.
train_data, val_data, test_data = map(
    Dataset.from_pandas, (df_train, df_val, df_test)
)

In [None]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) for a batch of generated summaries.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays as handed
            over by the trainer. ``predictions`` may also be a tuple whose
            first element holds the generated ids.

    Returns:
        dict mapping each ROUGE key returned by the metric to its score
        multiplied by 100 and rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a padding sentinel, not a vocabulary id, so it cannot be decoded.
    # It is present in the labels and can also show up in the predictions when
    # batches of different generated lengths are concatenated.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLsum expects one sentence per line.
    # NOTE(review): sent_tokenize uses NLTK's default punkt model, which may
    # not split on the Devanagari danda -- confirm for the Hindi data.
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]

    # The metric's default tokenizer keeps only [a-z0-9] characters, which
    # erases Devanagari text and drives every score towards zero. Split on
    # whitespace instead. The (English) stemmer only applies to the default
    # tokenizer, so it is no longer requested.
    result = rouge_score.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=lambda text: text.split(),
    )

    # Report the scores as percentages.
    return {key: round(value * 100, 4) for key, value in result.items()}

In [None]:
def tokenize_helper(dat, max_input_length=512, max_target_length=64):
    """Tokenize a batch of article/summary pairs for seq2seq training.

    Args:
        dat: batch with an "Article" entry (source texts) and a "Summary"
            entry (reference summaries).
        max_input_length: articles are truncated to this many tokens.
        max_target_length: summaries are truncated to this many tokens.

    Returns:
        The tokenizer output for the articles, with the summary token ids
        stored under the "labels" key.
    """
    model_inputs = tokenizer(
        dat["Article"],
        truncation=True,
        max_length=max_input_length,
    )
    # `text_target` marks the summaries as targets, so any target-specific
    # tokenizer handling is applied to them.
    summary_encoding = tokenizer(
        text_target=dat["Summary"],
        truncation=True,
        max_length=max_target_length,
    )
    model_inputs["labels"] = summary_encoding["input_ids"]
    return model_inputs



In [None]:
def add_words(example):
    """Add tagged 'input' and 'target' columns to a batch, in place.

    Every article gets the suffix "  </s> <2hi>"; every summary is wrapped as
    "<2hi> ... </s>". The same (mutated) batch object is returned.

    NOTE(review): tokenize_helper in this file reads "Article"/"Summary", so
    the 'input'/'target' columns created here appear to be unused -- confirm.
    """
    source_suffix = "  </s> <2hi>"
    target_prefix = "<2hi> "
    target_suffix = " </s>"

    tagged_articles = []
    for article in example['Article']:
        tagged_articles.append(article + source_suffix)
    example['input'] = tagged_articles

    tagged_summaries = []
    for summary in example['Summary']:
        tagged_summaries.append(target_prefix + summary + target_suffix)
    example['target'] = tagged_summaries

    return example


In [None]:
# Run add_words over the train and validation splits, batch by batch.
train_data = train_data.map(function=add_words, batched=True)
val_data = val_data.map(function=add_words, batched=True)

In [None]:
# Tokenize the train and validation splits in batches.
tokenized_train, tokenized_val = (
    split.map(tokenize_helper, batched=True) for split in (train_data, val_data)
)

In [None]:
# Collator that batches the tokenized seq2seq examples for the trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Drop every column that came from the un-tokenized datasets so that only the
# tokenizer outputs remain.
raw_train_columns = train_data.column_names
raw_val_columns = val_data.column_names
tokenized_train = tokenized_train.remove_columns(raw_train_columns)
tokenized_val = tokenized_val.remove_columns(raw_val_columns)

In [None]:
# Sanity check: summarize the first test article with the current model.
# max_length=512 matches the article truncation used by tokenize_helper; with
# truncation=True alone the limit depends on the tokenizer's own configured
# maximum, which may be unset.
temp = tokenizer(
    test_data["Article"][0],
    truncation=True,
    max_length=512,
    padding=True,
    return_tensors='pt',
).input_ids.to(device)
out = model.generate(temp, max_length=150, num_beams=4, early_stopping=True)
decoded_output = tokenizer.decode(out[0],
                                    skip_special_tokens=True,
                                    clean_up_tokenization_spaces=False)

print(decoded_output)


In [None]:
# Display the first test article (the input of the sanity-check generation).
test_data["Article"][0]

In [None]:
# Training hyper-parameters.
batch_size = 4
epochs = 5
args = Seq2SeqTrainingArguments(
    output_dir = "/kaggle/working/",
    evaluation_strategy = "epoch",  # evaluate once at the end of every epoch
    save_total_limit=1,  # keep only a single checkpoint on disk
    learning_rate = 1e-3,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    predict_with_generate=True,  # metrics are computed on generated text
    # Without an explicit limit, evaluation-time generation falls back to the
    # model's default maximum length. 64 matches the label truncation used by
    # tokenize_helper, so predictions and references are comparable.
    generation_max_length=64,
    num_train_epochs = epochs,
    report_to="none"
)

In [None]:
# Assemble the trainer from the model, its training arguments, the tokenized
# splits, the batching collator and the ROUGE metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Release cached, currently unused GPU memory before training starts.
import torch

torch.cuda.empty_cache()

In [None]:
# !rm -rf /kaggle/working/*

In [None]:
# Fine-tune the model with the Seq2SeqTrainer built from `args`.
trainer.train()

In [None]:
# Evaluate on the trainer's eval dataset; as the cell's last expression, the
# returned metrics are displayed.
trainer.evaluate()

In [None]:
# Progress marker for the notebook log.
print("model evaluation done")

In [None]:
from tqdm import tqdm

# Index-aligned lists: reference summaries and the model's generated summaries.
actual = []
pred = []

# Read each column once. Indexing test_data["..."][i] inside the loop can
# materialize the entire column on every access.
test_articles = test_data["Article"]
test_summaries = test_data["Summary"]

for article, reference in tqdm(
    zip(test_articles, test_summaries),
    total=len(test_articles),
    desc="Generating summaries",
):
    # `actual` holds the reference summary. It previously collected the input
    # article, which left the loaded "Summary" column unused and gave `pred`
    # nothing to be scored against.
    actual.append(reference)
    # max_length=512 matches the article truncation used during training.
    temp = tokenizer(
        article,
        truncation=True,
        max_length=512,
        padding=True,
        return_tensors='pt',
    ).input_ids.to(device)
    out = model.generate(temp, max_length=150, num_beams=4, early_stopping=True)
    decoded_output = tokenizer.decode(out[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
    pred.append(decoded_output)

In [None]:
# Persist the `pred` list with pickle so it can be inspected or scored later.
pred_path = "/kaggle/working/pred"
with open(pred_path, 'wb') as pred_file:
    pickle.dump(pred, pred_file)

print("predictions")

In [None]:
# Persist the `actual` list built by the inference loop, alongside `pred`.
actual_path = "/kaggle/working/actual"
with open(actual_path, 'wb') as actual_file:
    pickle.dump(actual, actual_file)

print("actuals")

In [None]:
# import os

# file_path = ""

# # Check if the file exists before deleting
# if os.path.exists(file_path):
#     os.remove(file_path)
#     print(f"File {file_path} deleted successfully.")
# else:
#     print(f"File {file_path} does not exist.")


In [None]:
# Archive the fine-tuned checkpoint so it can be downloaded.
# NOTE(review): the checkpoint directory name (step 23500) is hard-coded; the
# step count depends on dataset size, batch size and epochs, and
# save_total_limit=1 keeps only one checkpoint under /kaggle/working/ --
# confirm this directory exists after training.
!zip -r fine_tuned_model_mT5_1500.zip /kaggle/working/checkpoint-23500