1) mounting drive

In [1]:
# Mount Google Drive at /content/drive; the dataset, cache and checkpoint paths
# used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


2) install libraries

In [2]:
!pip install scikit-learn transformers torch tqdm rouge-score bert-score matplotlib

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12

3) import required libraries

In [3]:
import os
import pickle
from multiprocessing import Pool
import random
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, get_scheduler, BatchEncoding
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from torch.optim import AdamW
from tqdm import tqdm
from rouge_score import rouge_scorer
from bert_score import score as bert_score
import matplotlib.pyplot as plt

4) device setup

In [4]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
accelerator = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(accelerator)
print("Device using:", device)

Device using: cuda


5) dataset paths

In [None]:
# Drive folders holding the input judgments and their reference summaries.
judgment_dir = "/content/drive/MyDrive/talqs dataset/summarize dataset/input judgements"
summary_dir = "/content/drive/MyDrive/talqs dataset/summarize dataset/actual summaries"

# Quick sanity check: report how many entries each folder contains.
print("Summarize model dataset")
judgment_count = len(os.listdir(judgment_dir))
print("Total input judgement files:", judgment_count)
summary_count = len(os.listdir(summary_dir))
print("Total actual summaries files:", summary_count)

Summarize model dataset


6) dataset manipulation & splitting

In [None]:
# Deterministic subset: sort the judgment filenames and keep the first 4000.
all_judgment_files = os.listdir(judgment_dir)
all_judgment_files.sort()
filenames = all_judgment_files[:4000]

# Seed Python's RNG, then make a 70/30 train/test split with a fixed
# random_state so the partition is the same on every run.
random.seed(42)
train_files, test_files = train_test_split(
    filenames,
    test_size=0.3,
    random_state=42,
)

7) reading data as chunks

In [None]:
def read_file(path):
    """Return the full contents of the UTF-8 text file at ``path``.

    Args:
        path: Location of the file to read.

    Returns:
        The file's text as a single string.
    """
    with open(path, mode='r', encoding='utf-8') as source:
        contents = source.read()
    return contents

def chunk_text(text, chunk_size=1024):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` tokens.

    The text is tokenized with the notebook-level ``tokenizer``; each run of
    ``chunk_size`` tokens is converted back into a string.

    Args:
        text: The document to split.
        chunk_size: Maximum number of tokens per piece.

    Returns:
        A list of strings, one per token window, in document order.
    """
    token_list = tokenizer.tokenize(text)
    pieces = []
    for start in range(0, len(token_list), chunk_size):
        window = token_list[start:start + chunk_size]
        pieces.append(tokenizer.convert_tokens_to_string(window))
    return pieces

8) parallel file processing

In [None]:
def process_file(fname):
    """Build (chunk, summary) training pairs for one judgment file.

    Args:
        fname: File name looked up in both ``judgment_dir`` and ``summary_dir``.

    Returns:
        A list of ``(judgment_chunk, full_summary)`` tuples, one per chunk of
        the judgment; an empty list when either file is missing.
    """
    judgment_path = os.path.join(judgment_dir, fname)
    summary_path = os.path.join(summary_dir, fname)

    # Skip files that do not have both a judgment and a summary.
    if not (os.path.exists(judgment_path) and os.path.exists(summary_path)):
        return []

    judgment_text = read_file(judgment_path)
    summary_text = read_file(summary_path)

    # Every chunk of the judgment is paired with the same complete summary.
    pairs = []
    for piece in chunk_text(judgment_text, 1024):
        pairs.append((piece, summary_text))
    return pairs

def preprocess_parallel(file_list, workers=4):
    """Run ``process_file`` over ``file_list`` in a process pool and flatten the results.

    Args:
        file_list: File names to process.
        workers: Number of worker processes in the pool.

    Returns:
        One flat list containing every pair produced for every file, in the
        same order as ``file_list``.
    """
    with Pool(processes=workers) as pool:
        per_file_pairs = pool.map(process_file, file_list)

    flattened = []
    for pairs in per_file_pairs:
        flattened.extend(pairs)
    return flattened

9) save and load utilities

In [None]:
def save_data(data, path):
    """Pickle ``data`` into the file at ``path``, replacing any existing file.

    Args:
        data: Any picklable object.
        path: Destination file path.
    """
    with open(path, mode='wb') as sink:
        pickle.Pickler(sink).dump(data)

def load_data(path):
    """Load and return the object pickled in the file at ``path``.

    NOTE(review): unpickling can execute arbitrary code, so only point this
    at files this notebook wrote itself.

    Args:
        path: Path of a file previously written with pickle.

    Returns:
        The unpickled object.
    """
    with open(path, mode='rb') as source:
        restored = pickle.Unpickler(source).load()
    return restored

10) start fine tuning

In [None]:
# Pretrained checkpoint to fine-tune; "google/pegasus-large" is the noted alternative.
model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the seq2seq weights, then move the model onto the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

11) checkpoint - 1

In [None]:
# Checkpoints are stored in the dataset folder on Drive.
checkpoint_dir = "/content/drive/MyDrive/talqs dataset/summarize dataset"
os.makedirs(checkpoint_dir, exist_ok=True)
checkpoint_path = os.path.join(checkpoint_dir, "checkpoint.pt")

# Assume a fresh run; overridden below when a saved checkpoint is found.
start_epoch = 0
if not os.path.exists(checkpoint_path):
    print("No checkpoint found")
else:
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    # The stored value is the 0-based index of the last finished epoch,
    # so training resumes with the one after it.
    start_epoch = checkpoint["epoch"] + 1
    print(f"Loaded checkpoint from epoch {start_epoch}")

12) dataset class

In [None]:
class SummarizationDataset(Dataset):
    """Dataset that tokenizes (source text, target summary) pairs on access.

    Args:
        data: Sequence of ``(source_text, target_text)`` pairs.
        tokenizer: Callable tokenizer that returns ``input_ids`` and
            ``attention_mask`` tensors and exposes ``pad_token_id``.
        max_input_len: Length every source sequence is padded/truncated to.
        max_target_len: Length every target sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_input_len=1024, max_target_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_target_len = max_target_len

    def __len__(self):
        """Return the number of (source, target) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize pair ``idx`` and return its model inputs.

        Returns:
            A dict with 1-D tensors ``input_ids`` and ``attention_mask`` of
            length ``max_input_len`` and ``labels`` of length
            ``max_target_len``, where padding positions in ``labels`` are
            replaced by -100.
        """
        src_text, tgt_text = self.data[idx]
        input_enc = self.tokenizer(
            src_text,
            max_length=self.max_input_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        target_enc = self.tokenizer(
            tgt_text,
            max_length=self.max_target_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        # The tokenizer returns a batch of one; drop that leading dimension.
        input_ids = input_enc['input_ids'].squeeze(0)
        attention_mask = input_enc['attention_mask'].squeeze(0)
        labels = target_enc['input_ids'].squeeze(0)
        # Mask padding with -100 (the index PyTorch's cross-entropy ignores by
        # default). Use the tokenizer this dataset was built with rather than a
        # notebook-level global, so the pad id always matches the encoding.
        labels[labels == self.tokenizer.pad_token_id] = -100
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }

13) loading

In [None]:
# Pickle caches for the preprocessed (chunk, summary) pairs.
train_path = "/content/drive/MyDrive/talqs dataset/summarize dataset/train_data.pkl"
test_path = "/content/drive/MyDrive/talqs dataset/summarize dataset/test_data.pkl"

# Both caches must exist to be reused; otherwise rebuild and store both.
cache_ready = os.path.exists(train_path) and os.path.exists(test_path)
if not cache_ready:
    print("Preprocessing datasets in parallel...")
    train_data = preprocess_parallel(train_files, workers=4)
    test_data = preprocess_parallel(test_files, workers=4)

    save_data(train_data, train_path)
    save_data(test_data, test_path)
    print("Saved preprocessed datasets")
else:
    print("Loading cached datasets...")
    train_data = load_data(train_path)
    test_data = load_data(test_path)

14) dataset class loading

In [None]:
# Wrap the preprocessed pairs in Dataset objects, keeping the default length limits.
train_dataset = SummarizationDataset(data=train_data, tokenizer=tokenizer)
test_dataset = SummarizationDataset(data=test_data, tokenizer=tokenizer)

15) data loaders

In [None]:
# Options shared by both loaders; only the training loader shuffles its samples.
loader_options = dict(batch_size=4, num_workers=2, pin_memory=True)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, **loader_options)

16) checkpoint-2

In [None]:
def save_checkpoint(epoch, model, optimizer, scaler, lr_scheduler, checkpoint_path):
    """Write a resumable training checkpoint to ``checkpoint_path``.

    Args:
        epoch: 0-based index of the epoch that just finished.
        model: Object whose ``state_dict()`` is stored under ``model_state_dict``.
        optimizer: Stored under ``optimizer_state_dict``.
        scaler: Stored under ``scaler_state_dict``.
        lr_scheduler: Stored under ``scheduler_state_dict``.
        checkpoint_path: Destination file passed to ``torch.save``.
    """
    training_state = {"epoch": epoch}
    components = (
        ("model_state_dict", model),
        ("optimizer_state_dict", optimizer),
        ("scaler_state_dict", scaler),
        ("scheduler_state_dict", lr_scheduler),
    )
    for key, component in components:
        training_state[key] = component.state_dict()

    torch.save(training_state, checkpoint_path)
    # Reported 1-based, matching the "Epoch i/N" progress message.
    print(f"Checkpoint saved at epoch {epoch + 1}")

17) hyper parameters included
(optimizer type, learning rate, warmup steps, scheduler type, number of epochs, gradient clipping value)

In [None]:
# Training hyperparameters.
epochs = 10
grad_clip = 1.0

optimizer = AdamW(model.parameters(), lr=5e-5)

# The linear schedule has to cover every optimizer step of the whole run.
# It was previously sized for 4 epochs while training ran for 10, which
# drives the learning rate to zero after epoch 4.
num_training_steps = len(train_loader) * epochs
lr_scheduler = get_scheduler("linear", optimizer, num_warmup_steps=500, num_training_steps=num_training_steps)
scaler = GradScaler()

model.train()

# Restore optimizer/scaler/scheduler state only when the checkpoint cell
# actually loaded a checkpoint (it then sets start_epoch >= 1 and defines
# `checkpoint`). Testing for the file here instead could hit an undefined
# `checkpoint` if the file was created after that cell ran.
if start_epoch > 0:
    optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
    scaler.load_state_dict(checkpoint["scaler_state_dict"])
    lr_scheduler.load_state_dict(checkpoint["scheduler_state_dict"])

18) training loop

In [None]:
# Fine-tuning loop: trains from start_epoch up to `epochs`, records the mean
# loss of every epoch, and writes a resumable checkpoint after each one.
# Mean training loss per epoch run in this session (used by the loss plot).
train_losses = []

for epoch in range(start_epoch, epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    total_loss = 0

    for batch in tqdm(train_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass under automatic mixed precision; passing `labels`
        # makes the model return the loss directly.
        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Backpropagate the scaled loss, then unscale the gradients before
        # clipping so that grad_clip is applied to their true magnitudes.
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        scaler.step(optimizer)
        scaler.update()
        # The scheduler advances once per batch, not once per epoch.
        lr_scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)

    print(f"\nTraining Loss: {avg_loss:.3f}")

    save_checkpoint(epoch, model, optimizer, scaler, lr_scheduler, checkpoint_path)

# After the last epoch, store only the model weights (no optimizer/scheduler state).
final_model_path = "/content/drive/MyDrive/talqs dataset/summarize dataset/model_final.pth"
torch.save(model.state_dict(), final_model_path)
print(f"\nModel saved to: {final_model_path}")

19) plotting graph

In [None]:
# One x position per recorded loss, numbered from the first epoch trained in
# this session. Deriving the range from len(train_losses) keeps x and y the
# same length even when training stopped before reaching `epochs`.
epoch_axis = range(start_epoch + 1, start_epoch + 1 + len(train_losses))

plt.figure(figsize=(8, 5))
plt.plot(epoch_axis, train_losses, marker='o', color='blue', label='Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss per Epoch')
plt.grid(True)
plt.legend()
plt.tight_layout()
plt.show()

20) generating batch wise

In [None]:
def batch_generate_chunks(chunks, batch_size=8):
    """Generate one summary per text chunk, processing ``batch_size`` chunks at a time.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        chunks: List of source text strings.
        batch_size: Number of chunks encoded and generated together.

    Returns:
        A list of decoded summary strings, in the same order as ``chunks``.
    """
    generated = []
    for start in range(0, len(chunks), batch_size):
        current_batch = chunks[start:start + batch_size]
        encoded = tokenizer(
            current_batch,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=1024,
        )
        encoded = encoded.to(device)

        # No gradients are needed for generation.
        with torch.no_grad():
            output_ids = model.generate(
                input_ids=encoded['input_ids'],
                attention_mask=encoded['attention_mask'],
                max_length=128,
                num_beams=4,
                early_stopping=True,
            )

        generated += tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return generated

21) generate and save test summaries

In [None]:
# Folder that receives one generated summary file per test judgment.
test_output_dir = "/content/drive/MyDrive/talqs dataset/summarize dataset/output"
os.makedirs(test_output_dir, exist_ok=True)

print("\nGenerating test summaries...")
predictions, references = [], []

model.eval()

for fname in tqdm(test_files):
    judgment_path = os.path.join(judgment_dir, fname)
    summary_path = os.path.join(summary_dir, fname)
    # Only evaluate files that have both a judgment and a reference summary.
    if not (os.path.exists(judgment_path) and os.path.exists(summary_path)):
        continue

    judgment = read_file(judgment_path)
    reference_summary = read_file(summary_path)

    # Summarize each chunk separately, then join the partial summaries
    # with single spaces to form the document-level prediction.
    judgment_chunks = chunk_text(judgment, 1024)
    predicted_chunks = batch_generate_chunks(judgment_chunks, batch_size=8)
    full_summary = ' '.join(predicted_chunks)

    output_path = os.path.join(test_output_dir, fname)
    with open(output_path, 'w', encoding='utf-8') as out_file:
        out_file.write(full_summary)

    predictions.append(full_summary)
    references.append(reference_summary)

22) ROUGE scores evaluation

In [None]:
print("Evaluating ROUGE Scores on test set...")
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# Score every prediction against its reference; the reference is passed first.
total_scores = []
for ref, pred in zip(references, predictions):
    total_scores.append(scorer.score(ref, pred))

def average_rouge(scores_list, metric):
    """Return the mean F-measure of ``metric`` across ``scores_list``.

    Args:
        scores_list: Sequence of mappings from metric name to a score object
            that has an ``fmeasure`` attribute.
        metric: Key of the metric to average (e.g. ``'rouge1'``).

    Returns:
        The arithmetic mean of the selected F-measures.
    """
    f_measures = [entry[metric].fmeasure for entry in scores_list]
    return sum(f_measures) / len(scores_list)

# Report the mean F-measure of each ROUGE variant over the test set.
for metric_key, display_name in (('rouge1', 'ROUGE-1'), ('rouge2', 'ROUGE-2'), ('rougeL', 'ROUGE-L')):
    print(f"Average {display_name}:", average_rouge(total_scores, metric_key))

23) BERT score evaluation

In [None]:
# Announce the evaluation before the scoring call runs; the message used to
# appear only after scoring had already finished.
print("Evaluating BERT Scores on test set...")

# Candidates (model predictions) are passed first, references second.
P, R, F1 = bert_score(predictions, references, lang="en", model_type="roberta-base", batch_size=1, verbose=True)

# Each returned value is averaged over all prediction/reference pairs.
print(f"Average Precision: {P.mean().item():.4f}")
print(f"Average Recall:    {R.mean().item():.4f}")
print(f"Average F1:        {F1.mean().item():.4f}")