In [None]:
import os
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Create a folder in the root directory
!mkdir -p "/content/drive/My Drive/FinalTextSummarization"

Mounted at /content/drive


In [None]:
# Installing all the required libraries
!pip install datasets --q
# !pip install -U transformers --q
!pip install rouge_score --q

#Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`
!pip install -U transformers[torch] --q
!pip install accelerate -U --q

import pandas as pd
from datasets import Dataset


from datasets import load_dataset
from transformers import BertTokenizer, BertLMHeadModel
from transformers import DataCollatorWithPadding, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback
from rouge_score import rouge_scorer

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires pyarrow<15.0.0a0,>=14.0.1, but you have pyarrow 16.1.0 w

In [None]:
tokenizer = BertTokenizer.from_pretrained('Shushant/nepaliBERT')

In [None]:
# Load the train dataset from CSV
train_df = pd.read_csv('/content/drive/My Drive/TextSummarization/final_dataset_train.csv')
train_dataset = Dataset.from_pandas(train_df)

# Load the validation dataset from CSV
val_df = pd.read_csv('/content/drive/My Drive/TextSummarization/final_dataset_validation.csv')
val_dataset = Dataset.from_pandas(val_df)

# Load the test dataset from CSV
test_df = pd.read_csv('/content/drive/My Drive/TextSummarization/final_dataset_test.csv')
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Preprocess the data for fine-tuning
def preprocess_function(examples):
    inputs = examples["text"]
    targets = examples["summary"]

    #Tokenize the inputs
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length", return_tensors="pt")

    #Tokenize the targets
    with tokenizer.as_target_tokenizer():
      labels = tokenizer(targets, max_length=512, truncation=True, padding="max_length", return_tensors="pt")

    model_inputs["labels"] = labels.input_ids

    return model_inputs

# Create DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding="max_length", return_tensors="pt")

# # Map preprocess_function to dataset


In [None]:
%%time
## train_dataset & val_dataset both are raw.
## have to tokenize them before you can feed it to trainer (in the code block below)
tokenized_train = train_dataset.map(preprocess_function, batched=True, batch_size=10,  remove_columns=test_dataset.column_names)
tokenized_val = val_dataset.map(preprocess_function, batched=True, batch_size=10,  remove_columns=test_dataset.column_names)

In [None]:
# Modify the Model Architecture
class NepaliBERTSummarizer(nn.Module):
    def __init__(self):
        super(NepaliBERTSummarizer, self).__init__()
        self.bert = BertModel.from_pretrained('Shushant/nepaliBERT')
        self.linear = nn.Linear(self.bert.config.hidden_size, tokenizer.vocab_size)  # Adjust output size as needed

    def forward(self, input_ids, attention_mask=None):
        outputs = self.bert(input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        summary_logits = self.linear(sequence_output)
        return summary_logits

model = NepaliBERTSummarizer()
# Initialize the model
model = NepaliBERTSummarizer()

In [None]:
# Fine-tune the BertLMHeadModel for text summarization
training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/My Drive/FinalFinal/NepBERTa-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=15,
    weight_decay=0.01,
    save_total_limit=5,
    push_to_hub=False,
    load_best_model_at_end=True
)

#Stopping training if validation loss doesn't improve for 3 epochs
early_stopping = EarlyStoppingCallback(early_stopping_patience = 3)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    callbacks=[early_stopping]      #Add the early stopping callback
)

# Get the first batch of the training dataloader
for batch in trainer.get_train_dataloader():
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]
    labels = batch["labels"]

    # Pass the input through the model to get the output
    #outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    # Print the input and output sizes
    print("Input size:", input_ids.size())
    print("Attention Mask size:", attention_mask.size())
    print("Labels size:", labels.size())
    #print("Output size:", outputs.size())

    break  # Break after processing the first batch

# Continue with trainer.train() to start the training process
trainer.train()

# Save the fine-tuned model
model.save_pretrained("model")
tokenizer.save_pretrained("tokenizer")

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.


Input size: torch.Size([4, 512])
Attention Mask size: torch.Size([4, 512])
Labels size: torch.Size([4, 512])


Epoch,Training Loss,Validation Loss
1,0.625,0.576531
2,0.598,0.569704
3,0.5798,0.569248
4,0.5801,0.57016


Epoch,Training Loss,Validation Loss
1,0.625,0.576531
2,0.598,0.569704
3,0.5798,0.569248
4,0.5801,0.57016
5,0.5595,0.578725
6,0.5488,0.60399


There were missing keys in the checkpoint model loaded: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias'].


('/content/drive/My Drive/FinalTextSummarization/tokenizer/tokenizer_config.json',
 '/content/drive/My Drive/FinalTextSummarization/tokenizer/special_tokens_map.json',
 '/content/drive/My Drive/FinalTextSummarization/tokenizer/vocab.txt',
 '/content/drive/My Drive/FinalTextSummarization/tokenizer/added_tokens.json')

In [None]:
# Load the fine-tuned model
model = BertLMHeadModel.from_pretrained("model", num_labels=2, ignore_mismatched_sizes=True)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained("tokenizer")
tokenizer

BertTokenizer(name_or_path='/content/drive/My Drive/FinalTextSummarization/tokenizer', vocab_size=30523, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
from rouge_score import rouge_scorer
from tqdm.auto import tqdm

# Evaluate the fine-tuned model using ROUGE
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

references = test_dataset["target"]
predictions = []

####=======================================================================================================
####=======================================================================================================
## NOTE: while looping through dataset always use index first and then "key". e.g test_dataset[i]["target"]
######### never use it like this test_dataset["target"][i]; as this is slow and memory intensive;
####=======================================================================================================
## NOTE: don't use padding='max_length' in tokenizer when testing - That's why there was error
######## BERT's max token limit is 512, since BERT is not a generator model,
########## model couldn't generate any more token when the input length is already 512
####=======================================================================================================
####=======================================================================================================
for i in tqdm(range(test_dataset.num_rows)):
  inputs = tokenizer(test_dataset[i]["text"], return_tensors="pt", max_length=400, truncation=True).input_ids
  # print(inputs.shape)
  #inputs.to('cuda:0') #if running on GPU
  output_ids = model.generate(inputs, max_new_tokens=50, num_beams=4)
  prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)
  predictions.append(prediction)

rouge_scores = []

for i in range(len(predictions)):
  references[i]
  predictions[i]
  rouge_scores.append(scorer.score(references[i], predictions[i]))

print(rouge_scores)
print("ROUGE-1:", rouge_scores[0]['rouge1'].fmeasure)
print("ROUGE-2:", rouge_scores[0]['rouge2'].fmeasure)
print("ROUGE-L:", rouge_scores[0]['rougeL'].fmeasure)

  0%|          | 0/725 [00:00<?, ?it/s]

[{'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0, recall=0, fmeasure=0)}, {'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0, recall=0, fmeasure=0)}, {'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0, recall=0, fmeasure=0)}, {'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0, recall=0, fmeasure=0)}, {'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0, recall=0, fmeasure=0)}, {'rouge1': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rouge2': Score(precision=0.0, recall=0.0, fmeasure=0.0), 'rougeL': Score(precision=0, rec

In [None]:
# Generate Summary for the given long text using the fine-tuned NepBerta Model
long_text = "सहकारी सन्दर्भमा यो नवौं छानबिन समिति हो । बाँकी ८ वटा छानबिन समिति, जाँचबुझ आयोग, यसअघि बनेका छन । नेपालमा सहकारीलाई संस्थागत गर्ने सन्दर्भमा जति पनि छानबिन समिति, अध्ययन, जाँचबुझ समितिहरु बनेका छन् । ती सबैका प्रतिवेदनहरु सचिवालयमार्फत पत्रचार गरेर प्राप्त गरिसक्ने भनिएको छ',सभापति थापाले भने,'सहकारीसँग सम्बन्धित सबै आवश्यक त्यस्ता दस्तावेज, डकुमेण्ट, प्रतिवेदनहरु उपलब्ध गराउन पनि म समितिको तर्फबाट सम्बद्ध सबैसँग विनम्रतापूर्वक अनुरोध गर्न चाहन्छु । "
input_ids = tokenizer(long_text, return_tensors="pt", max_length=512, truncation=True)["input_ids"]
output_ids = model.generate(input_ids, max_new_tokens=50, num_beams=4)
summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)

output_ids.shape
print("Summary:", summary)

Summary: सहकारी सनदरभमा यो नवौ छानबिन समिति हो । बाकी ८ वटा छानबिन समिति, जाचबझ आयोग, यसअघि बनका छन । नपालमा सहकारीलाई ससथागत गरन सनदरभमा जति पनि छानबिन समिति, अधययन, जाचबझ समितिहर बनका छन । ती सबका परतिवदनहर सचिवालयमारफत पतरचार गरर परापत गरिसकन भनिएको छ ', सभापति थापाल भन,'सहकारीसग समबनधित सब आवशयक तयसता दसतावज, डकमणट, परतिवदनहर उपलबध गराउन पनि म समितिको तरफबाट समबदध सबसग विनमरतापरवक अनरोध गरन चाहनछ ।


In [None]:
# Function to summarize long text using the fine-tuned model
def summarize_text(text):
    input_ids = tokenizer(text, return_tensors="pt", max_length=512, truncation=True)["input_ids"]
    input_ids
    output_ids = model.generate(input_ids, max_new_tokens=512, num_beams=4, num_return_sequences=1)
    summary = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    print(output_ids.shape)
    return summary

# Prompt the user to input long text
long_text = input("Enter the long text to be summarized: ")

# Generate summary using the fine-tuned model
summary = summarize_text(long_text)

print("Summary:", summary)